/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <[email protected]>
 *		Alan Cox, <[email protected]>
 *		Linus Torvalds, <[email protected]>
 *		Alexey Kuznetsov, <[email protected]>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		([email protected])	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300 * HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
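
/*
 * Note (illustrative, not part of this file): rt_tos2priority() in
 * include/net/route.h is assumed to consume this table as
 *
 *	ip_tos2prio[IPTOS_TOS(tos) >> 1]
 *
 * i.e. it is indexed by the four IPv4 TOS bits with the low bit
 * dropped, mapping each TOS class to a queueing priority band.
 */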


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
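
/*
 * Sketch of the reader side this implies (illustrative only; the table
 * and helpers are the ones defined below in this file):
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (compare_keys(&rth->fl, &fl)) {
 *			dst_use(&rth->dst, jiffies);  -- atomic use/refcnt bump
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 */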

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
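
/*
 * Usage note (illustrative): every lookup and insert in this file mixes
 * the per-namespace generation id into the bucket hash, e.g.
 *
 *	hash = rt_hash(daddr, skeys[i], ikeys[k], rt_genid(net));
 *
 * so bumping rt_genid (see rt_cache_invalidate() below) both re-seeds
 * the hash and makes every previously cached entry fail the
 * rt_is_expired() check, invalidating the whole cache without walking it.
 */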

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			r->dst.hh ? (r->dst.hh->hh_output ==
				     dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->dst.expires &&
	    time_after_eq(jiffies, rth->dst.expires))
		goto out;

	age = jiffies - rth->dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
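
/*
 * Worked example (illustrative): an unreferenced output-route entry
 * last used 100 jiffies ago, neither redirected nor expiring, scores
 * (~100 & ~(3<<30)) | (1<<30).  The two top bits rank "very valuable"
 * and "not quite useless" entries above plain ones; within a class,
 * older entries (larger age, hence smaller ~age) score lower.  The
 * eviction scan in rt_intern_hash() tracks the minimum-score entry as
 * its candidate, so the lowest score is reaped first.
 */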

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute the average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This gives us an estimate for rt_chain_length_max:
 *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

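/*
 * Worked example (illustrative), with FRACT_BITS = 3 so ONE == 8: an
 * average chain length of 2.5 is carried as 20 and a standard
 * deviation of 1.25 as 10, so rt_check_expire() ends up computing
 *
 *	rt_chain_length_max = max(ip_rt_gc_elasticity, (20 + 4*10) >> 3)
 *			    = max(8, 7) = 8
 *
 * i.e. the fixed-point sum is only shifted back to an integer at the end.
 */
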
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(&aux->fl, &rth->fl))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without handing out a recently used rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

953
1080d709
NH
954static void rt_emergency_hash_rebuild(struct net *net)
955{
3ee94372 956 if (net_ratelimit())
1080d709 957 printk(KERN_WARNING "Route hash chain too long!\n");
3ee94372 958 rt_cache_invalidate(net);
1080d709
NH
959}
960
1da177e4
LT
961/*
962 Short description of GC goals.
963
964 We want to build algorithm, which will keep routing cache
965 at some equilibrium point, when number of aged off entries
966 is kept approximately equal to newly generated ones.
967
968 Current expiration strength is variable "expire".
969 We try to adjust it dynamically, so that if networking
970 is idle expires is large enough to keep enough of warm entries,
971 and when load increases it reduces to limit cache size.
972 */
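
/*
 * Scale of the relaxed goal below (illustrative): with, say, a
 * 2^17-bucket hash (rt_hash_log = 17) and the default elasticity of 8,
 * goal = entries - (8 << 17), so garbage collection only starts
 * trimming once the cache averages more than ~8 entries per bucket.
 */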

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate the number of entries which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out:	return 0;
}

1101
98376387
ED
1102/*
1103 * Returns number of entries in a hash chain that have different hash_inputs
1104 */
1105static int slow_chain_length(const struct rtable *head)
1106{
1107 int length = 0;
1108 const struct rtable *rth = head;
1109
1110 while (rth) {
1111 length += has_noalias(head, rth);
1c31720a 1112 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
98376387
ED
1113 }
1114 return length >> FRACT_BITS;
1115}

static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return err;
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->dst);
			return 0;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain
		 * length; when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->dst);
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(rt->rt_dst, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->dst.dev), net)) {
					rthp = &rth->dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->dst.dev != dev)
					break;

				dst_hold(&rth->dst);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->dst.__use		= 1;
				atomic_set(&rt->dst.__refcnt, 1);
				rt->dst.child		= NULL;
				if (rt->dst.dev)
					dev_hold(rt->dst.dev);
				rt->dst.obsolete	= -1;
				rt->dst.lastuse		= jiffies;
				rt->dst.path		= &rt->dst;
				rt->dst.neighbour	= NULL;
				rt->dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);
				if (rt->fi)
					atomic_inc(&rt->fi->fib_clntref);

				if (arp_bind_neighbour(&rt->dst) ||
				    !(rt->dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->dst.neighbour)
						neigh_event_send(rt->dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->dst;
				netevent.new = &rt->dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   (rt->dst.expires &&
			    time_after_eq(jiffies, rt->dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
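
/*
 * Worked example with the defaults above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50): redirect number k (k = 0..8) is not
 * sent before rate_last + (HZ/50 << k), so the gap doubles each time
 * (20 ms, 40 ms, ... ~5 s at HZ=1000).  After nine unanswered
 * redirects we stay silent until ip_rt_redirect_silence,
 * (HZ/50) << 10, i.e. roughly 20 s, has elapsed with no
 * redirect-worthy traffic.
 */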

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
		rt->dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
		rt->dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->dst.rate_last +
			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->dst.rate_last = jiffies;
		++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->dst.rate_tokens += now - rt->dst.rate_last;
	if (rt->dst.rate_tokens > ip_rt_error_burst)
		rt->dst.rate_tokens = ip_rt_error_burst;
	rt->dst.rate_last = now;
	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
		rt->dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
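
/*
 * Example (illustrative): an old router that sends ICMP "fragmentation
 * needed" without quoting a next-hop MTU forces us to guess.
 * guess_mtu(1500) returns 1492, the first plateau strictly below the
 * offending packet size; anything at or below the smallest plateau
 * (128) falls back to 68, the minimum IPv4 MTU.  The plateau values
 * are assumed to follow RFC 1191's table, except for the last two
 * (see the comment above).
 */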

unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int  ikeys[2] = { dev->ifindex, 0 };
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->dst)) {
					if (mtu < dst_mtu(&rth->dst)) {
						dst_confirm(&rth->dst);
						if (mtu < ip_rt_min_pmtu) {
							u32 lock = dst_metric(&rth->dst,
									      RTAX_LOCK);
							mtu = ip_rt_min_pmtu;
							lock |= (1 << RTAX_MTU);
							dst_metric_set(&rth->dst, RTAX_LOCK,
								       lock);
						}
						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
						dst_set_expires(&rth->dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			u32 lock = dst_metric(dst, RTAX_LOCK);
			mtu = ip_rt_min_pmtu;
			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
		}
		dst_metric_set(dst, RTAX_MTU, mtu);
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}
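
/*
 * Example (illustrative): an ICMP frag-needed quoting mtu = 300 on an
 * unlocked route is clamped to ip_rt_min_pmtu (512 + 20 + 20 = 552 by
 * default) and RTAX_MTU is locked, so later bogus reports cannot
 * shrink it further; the learned PMTU then ages out after
 * ip_rt_mtu_expires (10 minutes by default).
 */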

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	if (rt_is_expired((struct rtable *)dst))
		return NULL;
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
	       &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
	       skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" may be unaligned when it comes from
   IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt_is_output_route(rt))
		src = rt->rt_src;
	else {
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
			src = FIB_RES_PREFSRC(res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
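
/*
 * dst.tclassid packs two 16-bit routing realms (destination in the low
 * half, source in the high half) consumed by traffic classifiers such
 * as cls_route; set_class_tag() fills each half only if it is still
 * unset, so a more specific tag is never overwritten.
 */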

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
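
/*
 * The constant 40 above is the minimal IPv4 header (20 bytes) plus the
 * minimal TCP header (20 bytes): the advertised MSS defaults to the
 * device MTU minus that overhead, capped at 65535 - 40.
 */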

static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		const struct rtable *rt = (const struct rtable *) dst;

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
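
/*
 * 576 is the classic conservative IPv4 datagram size (cf. RFC 791 and
 * RFC 1122): when the MTU metric is locked and the route is gatewayed,
 * fall back to it rather than trust the device MTU end to end.
 */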

static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
{
	if (!(rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)) {
	no_cow:
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	} else {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
		if (!peer)
			goto no_cow;
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);
	}
}
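
/*
 * FLOWI_FLAG_PRECOW_METRICS asks for writable, per-peer metrics: the
 * fib_info metrics are copied into the inet_peer entry so that PMTU
 * and similar updates do not dirty the shared FIB data. Without the
 * flag (or when no peer can be bound) the dst simply references the
 * read-only fib_info metrics.
 */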

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, 0);
		if (err < 0)
			goto e_err;
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->dst.dev = init_net.loopback_dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif = 0;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
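
/*
 * Illustrative output (addresses and MACs made up), built from the two
 * printk formats above, for a 14-byte Ethernet link-layer header:
 *
 *	martian source 10.1.2.3 from 127.0.0.1, on dev eth0
 *	ll header: 00:1a:2b:3c:4d:5e:00:11:22:33:44:55:08:00
 */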

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
	rth->rt_iif =
	rth->fl.iif = in_dev->dev->ifindex;
	rth->dst.dev = (out_dev)->dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst = spec_dst;

	rth->dst.obsolete = -1;
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}

/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with 100% guarantee.
 *
 * called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi fl = { .fl4_dst = daddr,
			    .fl4_src = saddr,
			    .fl4_tos = tos,
			    .fl4_scope = RT_SCOPE_UNIVERSE,
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned hash;
	__be32 spec_dst;
	int err = -EINVAL;
	struct net *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be
	   detected by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	err = fib_lookup(net, &fl, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->dst.dev = net->loopback_dev;
	dev_hold(rth->dst.dev);
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->dst.input = ip_local_deliver;
	rth->rt_flags = flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result a host on a multicast
	   network may acquire a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
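
/*
 * Callers normally reach ip_route_input_common() through the
 * ip_route_input() / ip_route_input_noref() wrappers (in this kernel
 * generation, inlines in include/net/route.h), which differ only in
 * the noref argument: whether the skb takes a real reference on the
 * cached dst or an RCU-protected non-reference.
 */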

/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (ipv4_is_lbcast(fl->fl4_dst))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		res->fi = NULL;
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4)
			res->fi = NULL;
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst = oldflp->fl4_dst;
	rth->fl.fl4_tos = tos;
	rth->fl.fl4_src = oldflp->fl4_src;
	rth->fl.oif = oldflp->oif;
	rth->fl.mark = oldflp->mark;
	rth->rt_dst = fl->fl4_dst;
	rth->rt_src = fl->fl4_src;
	rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->dst.dev = dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->dst.output = ip_output;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;
	*result = rth;
	return 0;
}

/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
	}

	return err;
}

/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos = RT_FL_TOS(oldflp);
	struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
			    .fl4_src = oldflp->fl4_src,
			    .fl4_tos = tos & IPTOS_RT_MASK,
			    .fl4_scope = ((tos & RTO_ONLINK) ?
					  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned int flags = 0;
	struct net_device *dev_out = NULL;
	int err;


	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     ipv4_is_lbcast(oldflp->fl4_dst))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source addr (look, the routing cache
			   cannot know that ttl is zero, so that the packet
			   will not leave this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, oldflp->fl4_src, false))
				goto out;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			err = -ENETUNREACH;
			goto out;
		}
		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    ipv4_is_lbcast(oldflp->fl4_dst)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send a packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		err = -ENETUNREACH;
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src) {
			if (res.fi->fib_prefsrc)
				fl.fl4_src = res.fi->fib_prefsrc;
			else
				fl.fl4_src = fl.fl4_dst;
		}
		dev_out = net->loopback_dev;
		fl.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

out:	return err;
}

int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned int hash;
	int res;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rt_is_output_route(rth) &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	rcu_read_lock();
	res = ip_route_output_slow(net, rp, flp);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		= AF_INET,
	.protocol	= cpu_to_be16(ETH_P_IP),
	.destroy	= ipv4_dst_destroy,
	.check		= ipv4_blackhole_dst_check,
	.default_mtu	= ipv4_blackhole_default_mtu,
	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
};


static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(&(*rp)->dst);
	*rp = rt;
	return rt ? 0 : -ENOMEM;
}

int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
EXPORT_SYMBOL(ip_route_output_key);

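/*
 * Typical use (illustrative sketch, not from this file): fill a flowi
 * with at least fl4_dst, resolve, transmit via rt->dst, then drop the
 * reference with ip_rt_put():
 *
 *	struct flowi fl = { .fl4_dst = daddr };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl) == 0) {
 *		... send using rt->dst ...
 *		ip_rt_put(rt);
 *	}
 */
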
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->fl.mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

	error = rt->dst.error;
	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
	if (rt->peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
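
/*
 * RTM_GETROUTE handler; this is what services "ip route get" requests
 * from userspace: the query is resolved through the regular input or
 * output path and the resulting entry is dumped back to the requester
 * via rt_fill_info() above.
 */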
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi fl = {
			.fl4_dst = dst,
			.fl4_src = src,
			.fl4_tos = rtm->rtm_tos,
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.mark = mark,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
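
/*
 * Usage example (illustrative): writing a flush delay, in seconds, to
 * the write-only flush sysctl invalidates the route cache for the
 * writer's network namespace, e.g. from a shell:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */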

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};


#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
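
/*
 * Boot-time usage (illustrative): passing rhash_entries=262144 on the
 * kernel command line pre-sizes the route cache hash table instead of
 * letting alloc_large_system_hash() below pick a size from available
 * memory.
 */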

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup, tend
	   to synchronize. Perturb it a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif