net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <[email protected]>
  10  *              Alan Cox, <[email protected]>
  11  *              Linus Torvalds, <[email protected]>
  12  *              Alexey Kuznetsov, <[email protected]>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              ([email protected])      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static struct delayed_work expires_work;
 135 static unsigned long expires_ljiffies;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                          struct net_device *dev, int how);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149 static void rt_emergency_hash_rebuild(struct net *net);
 150
 151
 152 static struct dst_ops ipv4_dst_ops = {
 153         .family =               AF_INET,
 154         .protocol =             cpu_to_be16(ETH_P_IP),
 155         .gc =                   rt_garbage_collect,
 156         .check =                ipv4_dst_check,
 157         .destroy =              ipv4_dst_destroy,
 158         .ifdown =               ipv4_dst_ifdown,
 159         .negative_advice =      ipv4_negative_advice,
 160         .link_failure =         ipv4_link_failure,
 161         .update_pmtu =          ip_rt_update_pmtu,
 162         .local_out =            __ip_local_out,
 163         .entries =              ATOMIC_INIT(0),
 164 };
 165
 166 #define ECN_OR_COST(class)      TC_PRIO_##class
 167
 168 const __u8 ip_tos2prio[16] = {
 169         TC_PRIO_BESTEFFORT,
 170         ECN_OR_COST(FILLER),
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(BESTEFFORT),
 173         TC_PRIO_BULK,
 174         ECN_OR_COST(BULK),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_INTERACTIVE,
 178         ECN_OR_COST(INTERACTIVE),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE_BULK,
 182         ECN_OR_COST(INTERACTIVE_BULK),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK)
 185 };
 186
 187
 188 /*
 189  * Route cache.
 190  */
 191
 192 /* The locking scheme is rather straight forward:
 193  *
 194  * 1) Read-Copy Update protects the buckets of the central route hash.
 195  * 2) Only writers remove entries, and they hold the lock
 196  *    as they look at rtable reference counts.
 197  * 3) Only readers acquire references to rtable entries,
 198  *    they do so with atomic increments and with the
 199  *    lock held.
 200  */
 201
 202 struct rt_hash_bucket {
 203         struct rtable   *chain;
 204 };
 205
 206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 207         defined(CONFIG_PROVE_LOCKING)
 208 /*
 209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 210  * The size of this table is a power of two and depends on the number of CPUS.
 211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 212  */
 213 #ifdef CONFIG_LOCKDEP
 214 # define RT_HASH_LOCK_SZ        256
 215 #else
 216 # if NR_CPUS >= 32
 217 #  define RT_HASH_LOCK_SZ       4096
 218 # elif NR_CPUS >= 16
 219 #  define RT_HASH_LOCK_SZ       2048
 220 # elif NR_CPUS >= 8
 221 #  define RT_HASH_LOCK_SZ       1024
 222 # elif NR_CPUS >= 4
 223 #  define RT_HASH_LOCK_SZ       512
 224 # else
 225 #  define RT_HASH_LOCK_SZ       256
 226 # endif
 227 #endif
 228
 229 static spinlock_t       *rt_hash_locks;
 230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 231
 232 static __init void rt_hash_lock_init(void)
 233 {
 234         int i;
 235
 236         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 237                         GFP_KERNEL);
 238         if (!rt_hash_locks)
 239                 panic("IP: failed to allocate rt_hash_locks\n");
 240
 241         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 242                 spin_lock_init(&rt_hash_locks[i]);
 243 }
 244 #else
 245 # define rt_hash_lock_addr(slot) NULL
 246
 247 static inline void rt_hash_lock_init(void)
 248 {
 249 }
 250 #endif
 251
 252 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 253 static unsigned                 rt_hash_mask __read_mostly;
 254 static unsigned int             rt_hash_log  __read_mostly;
 255
 256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 257 #define RT_CACHE_STAT_INC(field) \
 258         (__raw_get_cpu_var(rt_cache_stat).field++)
 259
 260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 261                 int genid)
 262 {
 263         return jhash_3words((__force u32)(__be32)(daddr),
 264                             (__force u32)(__be32)(saddr),
 265                             idx, genid)
 266                 & rt_hash_mask;
 267 }
 268
 269 static inline int rt_genid(struct net *net)
 270 {
 271         return atomic_read(&net->ipv4.rt_genid);
 272 }
 273
 274 #ifdef CONFIG_PROC_FS
 275 struct rt_cache_iter_state {
 276         struct seq_net_private p;
 277         int bucket;
 278         int genid;
 279 };
 280
 281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 282 {
 283         struct rt_cache_iter_state *st = seq->private;
 284         struct rtable *r = NULL;
 285
 286         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 287                 if (!rt_hash_table[st->bucket].chain)
 288                         continue;
 289                 rcu_read_lock_bh();
 290                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 291                 while (r) {
 292                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 293                             r->rt_genid == st->genid)
 294                                 return r;
 295                         r = rcu_dereference(r->u.dst.rt_next);
 296                 }
 297                 rcu_read_unlock_bh();
 298         }
 299         return r;
 300 }
 301
 302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 303                                           struct rtable *r)
 304 {
 305         struct rt_cache_iter_state *st = seq->private;
 306
 307         r = r->u.dst.rt_next;
 308         while (!r) {
 309                 rcu_read_unlock_bh();
 310                 do {
 311                         if (--st->bucket < 0)
 312                                 return NULL;
 313                 } while (!rt_hash_table[st->bucket].chain);
 314                 rcu_read_lock_bh();
 315                 r = rt_hash_table[st->bucket].chain;
 316         }
 317         return rcu_dereference(r);
 318 }
 319
 320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 321                                         struct rtable *r)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 325                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 326                         continue;
 327                 if (r->rt_genid == st->genid)
 328                         break;
 329         }
 330         return r;
 331 }
 332
 333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 334 {
 335         struct rtable *r = rt_cache_get_first(seq);
 336
 337         if (r)
 338                 while (pos && (r = rt_cache_get_next(seq, r)))
 339                         --pos;
 340         return pos ? NULL : r;
 341 }
 342
 343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346         if (*pos)
 347                 return rt_cache_get_idx(seq, *pos - 1);
 348         st->genid = rt_genid(seq_file_net(seq));
 349         return SEQ_START_TOKEN;
 350 }
 351
 352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 353 {
 354         struct rtable *r;
 355
 356         if (v == SEQ_START_TOKEN)
 357                 r = rt_cache_get_first(seq);
 358         else
 359                 r = rt_cache_get_next(seq, v);
 360         ++*pos;
 361         return r;
 362 }
 363
 364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 365 {
 366         if (v && v != SEQ_START_TOKEN)
 367                 rcu_read_unlock_bh();
 368 }
 369
 370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 371 {
 372         if (v == SEQ_START_TOKEN)
 373                 seq_printf(seq, "%-127s\n",
 374                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 375                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 376                            "HHUptod\tSpecDst");
 377         else {
 378                 struct rtable *r = v;
 379                 int len;
 380
 381                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 382                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 383                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 384                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 385                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 386                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 387                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 388                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 389                         dst_metric(&r->u.dst, RTAX_WINDOW),
 390                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 391                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 392                         r->fl.fl4_tos,
 393                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 394                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 395                                        dev_queue_xmit) : 0,
 396                         r->rt_spec_dst, &len);
 397
 398                 seq_printf(seq, "%*s\n", 127 - len, "");
 399         }
 400         return 0;
 401 }
 402
 403 static const struct seq_operations rt_cache_seq_ops = {
 404         .start  = rt_cache_seq_start,
 405         .next   = rt_cache_seq_next,
 406         .stop   = rt_cache_seq_stop,
 407         .show   = rt_cache_seq_show,
 408 };
 409
 410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 411 {
 412         return seq_open_net(inode, file, &rt_cache_seq_ops,
 413                         sizeof(struct rt_cache_iter_state));
 414 }
 415
 416 static const struct file_operations rt_cache_seq_fops = {
 417         .owner   = THIS_MODULE,
 418         .open    = rt_cache_seq_open,
 419         .read    = seq_read,
 420         .llseek  = seq_lseek,
 421         .release = seq_release_net,
 422 };
 423
 424
 425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 426 {
 427         int cpu;
 428
 429         if (*pos == 0)
 430                 return SEQ_START_TOKEN;
 431
 432         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 433                 if (!cpu_possible(cpu))
 434                         continue;
 435                 *pos = cpu+1;
 436                 return &per_cpu(rt_cache_stat, cpu);
 437         }
 438         return NULL;
 439 }
 440
 441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 442 {
 443         int cpu;
 444
 445         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 446                 if (!cpu_possible(cpu))
 447                         continue;
 448                 *pos = cpu+1;
 449                 return &per_cpu(rt_cache_stat, cpu);
 450         }
 451         return NULL;
 452
 453 }
 454
 455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 456 {
 457
 458 }
 459
 460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 461 {
 462         struct rt_cache_stat *st = v;
 463
 464         if (v == SEQ_START_TOKEN) {
 465                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 466                 return 0;
 467         }
 468
 469         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 470                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 471                    atomic_read(&ipv4_dst_ops.entries),
 472                    st->in_hit,
 473                    st->in_slow_tot,
 474                    st->in_slow_mc,
 475                    st->in_no_route,
 476                    st->in_brd,
 477                    st->in_martian_dst,
 478                    st->in_martian_src,
 479
 480                    st->out_hit,
 481                    st->out_slow_tot,
 482                    st->out_slow_mc,
 483
 484                    st->gc_total,
 485                    st->gc_ignored,
 486                    st->gc_goal_miss,
 487                    st->gc_dst_overflow,
 488                    st->in_hlist_search,
 489                    st->out_hlist_search
 490                 );
 491         return 0;
 492 }
 493
 494 static const struct seq_operations rt_cpu_seq_ops = {
 495         .start  = rt_cpu_seq_start,
 496         .next   = rt_cpu_seq_next,
 497         .stop   = rt_cpu_seq_stop,
 498         .show   = rt_cpu_seq_show,
 499 };
 500
 501
 502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 503 {
 504         return seq_open(file, &rt_cpu_seq_ops);
 505 }
 506
 507 static const struct file_operations rt_cpu_seq_fops = {
 508         .owner   = THIS_MODULE,
 509         .open    = rt_cpu_seq_open,
 510         .read    = seq_read,
 511         .llseek  = seq_lseek,
 512         .release = seq_release,
 513 };
 514
 515 #ifdef CONFIG_NET_CLS_ROUTE
 516 static int rt_acct_proc_show(struct seq_file *m, void *v)
 517 {
 518         struct ip_rt_acct *dst, *src;
 519         unsigned int i, j;
 520
 521         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 522         if (!dst)
 523                 return -ENOMEM;
 524
 525         for_each_possible_cpu(i) {
 526                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 527                 for (j = 0; j < 256; j++) {
 528                         dst[j].o_bytes   += src[j].o_bytes;
 529                         dst[j].o_packets += src[j].o_packets;
 530                         dst[j].i_bytes   += src[j].i_bytes;
 531                         dst[j].i_packets += src[j].i_packets;
 532                 }
 533         }
 534
 535         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 536         kfree(dst);
 537         return 0;
 538 }
 539
 540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 541 {
 542         return single_open(file, rt_acct_proc_show, NULL);
 543 }
 544
 545 static const struct file_operations rt_acct_proc_fops = {
 546         .owner          = THIS_MODULE,
 547         .open           = rt_acct_proc_open,
 548         .read           = seq_read,
 549         .llseek         = seq_lseek,
 550         .release        = single_release,
 551 };
 552 #endif
 553
 554 static int __net_init ip_rt_do_proc_init(struct net *net)
 555 {
 556         struct proc_dir_entry *pde;
 557
 558         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 559                         &rt_cache_seq_fops);
 560         if (!pde)
 561                 goto err1;
 562
 563         pde = proc_create("rt_cache", S_IRUGO,
 564                           net->proc_net_stat, &rt_cpu_seq_fops);
 565         if (!pde)
 566                 goto err2;
 567
 568 #ifdef CONFIG_NET_CLS_ROUTE
 569         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 570         if (!pde)
 571                 goto err3;
 572 #endif
 573         return 0;
 574
 575 #ifdef CONFIG_NET_CLS_ROUTE
 576 err3:
 577         remove_proc_entry("rt_cache", net->proc_net_stat);
 578 #endif
 579 err2:
 580         remove_proc_entry("rt_cache", net->proc_net);
 581 err1:
 582         return -ENOMEM;
 583 }
 584
 585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 586 {
 587         remove_proc_entry("rt_cache", net->proc_net_stat);
 588         remove_proc_entry("rt_cache", net->proc_net);
 589         remove_proc_entry("rt_acct", net->proc_net);
 590 }
 591
 592 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 593         .init = ip_rt_do_proc_init,
 594         .exit = ip_rt_do_proc_exit,
 595 };
 596
 597 static int __init ip_rt_proc_init(void)
 598 {
 599         return register_pernet_subsys(&ip_rt_proc_ops);
 600 }
 601
 602 #else
 603 static inline int ip_rt_proc_init(void)
 604 {
 605         return 0;
 606 }
 607 #endif /* CONFIG_PROC_FS */
 608
 609 static inline void rt_free(struct rtable *rt)
 610 {
 611         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 612 }
 613
 614 static inline void rt_drop(struct rtable *rt)
 615 {
 616         ip_rt_put(rt);
 617         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 618 }
 619
 620 static inline int rt_fast_clean(struct rtable *rth)
 621 {
 622         /* Kill broadcast/multicast entries very aggresively, if they
 623            collide in hash table with more useful entries */
 624         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 625                 rth->fl.iif && rth->u.dst.rt_next;
 626 }
 627
 628 static inline int rt_valuable(struct rtable *rth)
 629 {
 630         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 631                 rth->u.dst.expires;
 632 }
 633
 634 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 635 {
 636         unsigned long age;
 637         int ret = 0;
 638
 639         if (atomic_read(&rth->u.dst.__refcnt))
 640                 goto out;
 641
 642         ret = 1;
 643         if (rth->u.dst.expires &&
 644             time_after_eq(jiffies, rth->u.dst.expires))
 645                 goto out;
 646
 647         age = jiffies - rth->u.dst.lastuse;
 648         ret = 0;
 649         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 650             (age <= tmo2 && rt_valuable(rth)))
 651                 goto out;
 652         ret = 1;
 653 out:    return ret;
 654 }
 655
 656 /* Bits of score are:
 657  * 31: very valuable
 658  * 30: not quite useless
 659  * 29..0: usage counter
 660  */
 661 static inline u32 rt_score(struct rtable *rt)
 662 {
 663         u32 score = jiffies - rt->u.dst.lastuse;
 664
 665         score = ~score & ~(3<<30);
 666
 667         if (rt_valuable(rt))
 668                 score |= (1<<31);
 669
 670         if (!rt->fl.iif ||
 671             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 672                 score |= (1<<30);
 673
 674         return score;
 675 }
 676
 677 static inline bool rt_caching(const struct net *net)
 678 {
 679         return net->ipv4.current_rt_cache_rebuild_count <=
 680                 net->ipv4.sysctl_rt_cache_rebuild_count;
 681 }
 682
 683 static inline bool compare_hash_inputs(const struct flowi *fl1,
 684                                         const struct flowi *fl2)
 685 {
 686         return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 687                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 688                 (fl1->iif ^ fl2->iif)) == 0);
 689 }
 690
 691 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 692 {
 693         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 694                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 695                 (fl1->mark ^ fl2->mark) |
 696                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 697                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 698                 (fl1->oif ^ fl2->oif) |
 699                 (fl1->iif ^ fl2->iif)) == 0;
 700 }
 701
 702 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 703 {
 704         return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
 705 }
 706
 707 static inline int rt_is_expired(struct rtable *rth)
 708 {
 709         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 710 }
 711
 712 /*
 713  * Perform a full scan of hash table and free all entries.
 714  * Can be called by a softirq or a process.
 715  * In the later case, we want to be reschedule if necessary
 716  */
 717 static void rt_do_flush(int process_context)
 718 {
 719         unsigned int i;
 720         struct rtable *rth, *next;
 721         struct rtable * tail;
 722
 723         for (i = 0; i <= rt_hash_mask; i++) {
 724                 if (process_context && need_resched())
 725                         cond_resched();
 726                 rth = rt_hash_table[i].chain;
 727                 if (!rth)
 728                         continue;
 729
 730                 spin_lock_bh(rt_hash_lock_addr(i));
 731 #ifdef CONFIG_NET_NS
 732                 {
 733                 struct rtable ** prev, * p;
 734
 735                 rth = rt_hash_table[i].chain;
 736
 737                 /* defer releasing the head of the list after spin_unlock */
 738                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
 739                         if (!rt_is_expired(tail))
 740                                 break;
 741                 if (rth != tail)
 742                         rt_hash_table[i].chain = tail;
 743
 744                 /* call rt_free on entries after the tail requiring flush */
 745                 prev = &rt_hash_table[i].chain;
 746                 for (p = *prev; p; p = next) {
 747                         next = p->u.dst.rt_next;
 748                         if (!rt_is_expired(p)) {
 749                                 prev = &p->u.dst.rt_next;
 750                         } else {
 751                                 *prev = next;
 752                                 rt_free(p);
 753                         }
 754                 }
 755                 }
 756 #else
 757                 rth = rt_hash_table[i].chain;
 758                 rt_hash_table[i].chain = NULL;
 759                 tail = NULL;
 760 #endif
 761                 spin_unlock_bh(rt_hash_lock_addr(i));
 762
 763                 for (; rth != tail; rth = next) {
 764                         next = rth->u.dst.rt_next;
 765                         rt_free(rth);
 766                 }
 767         }
 768 }
 769
 770 /*
 771  * While freeing expired entries, we compute average chain length
 772  * and standard deviation, using fixed-point arithmetic.
 773  * This to have an estimation of rt_chain_length_max
 774  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 775  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 776  */
 777
 778 #define FRACT_BITS 3
 779 #define ONE (1UL << FRACT_BITS)
 780
 781 static void rt_check_expire(void)
 782 {
 783         static unsigned int rover;
 784         unsigned int i = rover, goal;
 785         struct rtable *rth, *aux, **rthp;
 786         unsigned long samples = 0;
 787         unsigned long sum = 0, sum2 = 0;
 788         unsigned long delta;
 789         u64 mult;
 790
 791         delta = jiffies - expires_ljiffies;
 792         expires_ljiffies = jiffies;
 793         mult = ((u64)delta) << rt_hash_log;
 794         if (ip_rt_gc_timeout > 1)
 795                 do_div(mult, ip_rt_gc_timeout);
 796         goal = (unsigned int)mult;
 797         if (goal > rt_hash_mask)
 798                 goal = rt_hash_mask + 1;
 799         for (; goal > 0; goal--) {
 800                 unsigned long tmo = ip_rt_gc_timeout;
 801                 unsigned long length;
 802
 803                 i = (i + 1) & rt_hash_mask;
 804                 rthp = &rt_hash_table[i].chain;
 805
 806                 if (need_resched())
 807                         cond_resched();
 808
 809                 samples++;
 810
 811                 if (*rthp == NULL)
 812                         continue;
 813                 length = 0;
 814                 spin_lock_bh(rt_hash_lock_addr(i));
 815                 while ((rth = *rthp) != NULL) {
 816                         prefetch(rth->u.dst.rt_next);
 817                         if (rt_is_expired(rth)) {
 818                                 *rthp = rth->u.dst.rt_next;
 819                                 rt_free(rth);
 820                                 continue;
 821                         }
 822                         if (rth->u.dst.expires) {
 823                                 /* Entry is expired even if it is in use */
 824                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 825 nofree:
 826                                         tmo >>= 1;
 827                                         rthp = &rth->u.dst.rt_next;
 828                                         /*
 829                                          * We only count entries on
 830                                          * a chain with equal hash inputs once
 831                                          * so that entries for different QOS
 832                                          * levels, and other non-hash input
 833                                          * attributes don't unfairly skew
 834                                          * the length computation
 835                                          */
 836                                         for (aux = rt_hash_table[i].chain;;) {
 837                                                 if (aux == rth) {
 838                                                         length += ONE;
 839                                                         break;
 840                                                 }
 841                                                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 842                                                         break;
 843                                                 aux = aux->u.dst.rt_next;
 844                                         }
 845                                         continue;
 846                                 }
 847                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 848                                 goto nofree;
 849
 850                         /* Cleanup aged off entries. */
 851                         *rthp = rth->u.dst.rt_next;
 852                         rt_free(rth);
 853                 }
 854                 spin_unlock_bh(rt_hash_lock_addr(i));
 855                 sum += length;
 856                 sum2 += length*length;
 857         }
 858         if (samples) {
 859                 unsigned long avg = sum / samples;
 860                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 861                 rt_chain_length_max = max_t(unsigned long,
 862                                         ip_rt_gc_elasticity,
 863                                         (avg + 4*sd) >> FRACT_BITS);
 864         }
 865         rover = i;
 866 }
 867
 868 /*
 869  * rt_worker_func() is run in process context.
 870  * we call rt_check_expire() to scan part of the hash table
 871  */
 872 static void rt_worker_func(struct work_struct *work)
 873 {
             /* One expiry pass; the work argument itself is not used here. */
 874         rt_check_expire();
             /* Re-queue expires_work so the pass repeats after ip_rt_gc_interval. */
 875         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 876 }
 877
 878 /*
 879  * Perturbation of rt_genid by a small quantity [1..256]
 880  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 881  * many times (2^24) without giving recent rt_genid.
 882  * Jenkins hash is strong enough that little changes of rt_genid are OK.
 883  */
 884 static void rt_cache_invalidate(struct net *net)
 885 {
 886         unsigned char shuffle;
 887
             /* One random byte: 0..255. */
 888         get_random_bytes(&shuffle, sizeof(shuffle));
             /* Adding shuffle + 1 advances this namespace's rt_genid by 1..256. */
 889         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 890 }
 891
 892 /*
 893  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 894  * delay >= 0 : invalidate & flush cache (can be long)
 895  */
 896 void rt_cache_flush(struct net *net, int delay)
 897 {
             /* The generation id of @net is changed in every case. */
 898         rt_cache_invalidate(net);
             /*
              * Only a non-negative delay triggers an immediate flush; the
              * value of delay is not otherwise used.  rt_do_flush() is passed
              * 1 when we are not in softirq context and 0 when we are.
              */
 899         if (delay >= 0)
 900                 rt_do_flush(!in_softirq());
 901 }
 902
 903 /*
 904  * We change rt_genid and let gc do the cleanup
 905  */
 906 static void rt_secret_rebuild(unsigned long __net)
 907 {
             /* The argument carries the struct net pointer as an unsigned long. */
 908         struct net *net = (struct net *)__net;
 909         rt_cache_invalidate(net);
             /* Re-arm this namespace's timer ip_rt_secret_interval jiffies from now. */
 910         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 911 }
 912
 913 static void rt_secret_rebuild_oneshot(struct net *net)
 914 {
             /* Stop the periodic timer before invalidating by hand. */
 915         del_timer_sync(&net->ipv4.rt_secret_timer);
 916         rt_cache_invalidate(net);
             /* With a zero ip_rt_secret_interval the timer is left stopped. */
 917         if (ip_rt_secret_interval) {
                     /*
                      * NOTE(review): the new expiry is the timer's previous
                      * expires value plus the interval, whereas
                      * rt_secret_rebuild() uses jiffies plus the interval --
                      * confirm this is intended.
                      */
 918                 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
 919                 add_timer(&net->ipv4.rt_secret_timer);
 920         }
 921 }
 922
 923 static void rt_emergency_hash_rebuild(struct net *net)
 924 {
             /* The warning is rate limited; the rebuild below is not. */
 925         if (net_ratelimit()) {
 926                 printk(KERN_WARNING "Route hash chain too long!\n");
 927                 printk(KERN_WARNING "Adjust your secret_interval!\n");
 928         }
 929
             /* Invalidate the cache for @net now and restart its secret timer. */
 930         rt_secret_rebuild_oneshot(net);
 931 }
 932
 933 /*
 934    Short description of GC goals.
 935
 936    We want an algorithm that keeps the routing cache
 937    at some equilibrium point, where the number of aged-off entries
 938    is kept approximately equal to the number of newly generated ones.
 939
 940    The current expiration strength is the variable "expire".
 941    We try to adjust it dynamically, so that when networking
 942    is idle "expire" is large enough to keep enough warm entries,
 943    and when load increases it is reduced to limit the cache size.
 944  */
 945
 946 static int rt_garbage_collect(struct dst_ops *ops)
 947 {
 948         static unsigned long expire = RT_GC_TIMEOUT;
 949         static unsigned long last_gc;
 950         static int rover;
 951         static int equilibrium;
 952         struct rtable *rth, **rthp;
 953         unsigned long now = jiffies;
 954         int goal;
 955
 956         /*
 957          * Garbage collection is pretty expensive,
 958          * do not make it too frequently.
 959          */
 960
 961         RT_CACHE_STAT_INC(gc_total);
 962
 963         if (now - last_gc < ip_rt_gc_min_interval &&
 964             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 965                 RT_CACHE_STAT_INC(gc_ignored);
 966                 goto out;
 967         }
 968
 969         /* Calculate number of entries, which we want to expire now. */
 970         goal = atomic_read(&ipv4_dst_ops.entries) -
 971                 (ip_rt_gc_elasticity << rt_hash_log);
 972         if (goal <= 0) {
 973                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 974                         equilibrium = ipv4_dst_ops.gc_thresh;
 975                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 976                 if (goal > 0) {
 977                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 978                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 979                 }
 980         } else {
 981                 /* We are in dangerous area. Try to reduce cache really
 982                  * aggressively.
 983                  */
 984                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 985                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 986         }
 987
 988         if (now - last_gc >= ip_rt_gc_min_interval)
 989                 last_gc = now;
 990
 991         if (goal <= 0) {
 992                 equilibrium += goal;
 993                 goto work_done;
 994         }
 995
 996         do {
 997                 int i, k;
 998
 999                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1000                         unsigned long tmo = expire;
1001
1002                         k = (k + 1) & rt_hash_mask;
1003                         rthp = &rt_hash_table[k].chain;
1004                         spin_lock_bh(rt_hash_lock_addr(k));
1005                         while ((rth = *rthp) != NULL) {
1006                                 if (!rt_is_expired(rth) &&
1007                                         !rt_may_expire(rth, tmo, expire)) {
1008                                         tmo >>= 1;
1009                                         rthp = &rth->u.dst.rt_next;
1010                                         continue;
1011                                 }
1012                                 *rthp = rth->u.dst.rt_next;
1013                                 rt_free(rth);
1014                                 goal--;
1015                         }
1016                         spin_unlock_bh(rt_hash_lock_addr(k));
1017                         if (goal <= 0)
1018                                 break;
1019                 }
1020                 rover = k;
1021
1022                 if (goal <= 0)
1023                         goto work_done;
1024
1025                 /* Goal is not achieved. We stop process if:
1026
1027                    - if expire reduced to zero. Otherwise, expire is halfed.
1028                    - if table is not full.
1029                    - if we are called from interrupt.
1030                    - jiffies check is just fallback/debug loop breaker.
1031                      We will not spin here for long time in any case.
1032                  */
1033
1034                 RT_CACHE_STAT_INC(gc_goal_miss);
1035
1036                 if (expire == 0)
1037                         break;
1038
1039                 expire >>= 1;
1040 #if RT_CACHE_DEBUG >= 2
1041                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1042                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1043 #endif
1044
1045                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1046                         goto out;
1047         } while (!in_softirq() && time_before_eq(jiffies, now));
1048
1049         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1050                 goto out;
1051         if (net_ratelimit())
1052                 printk(KERN_WARNING "dst cache overflow\n");
1053         RT_CACHE_STAT_INC(gc_dst_overflow);
1054         return 1;
1055
1056 work_done:
1057         expire += ip_rt_gc_min_interval;
1058         if (expire > ip_rt_gc_timeout ||
1059             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1060                 expire = ip_rt_gc_timeout;
1061 #if RT_CACHE_DEBUG >= 2
1062         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1063                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1064 #endif
1065 out:    return 0;
1066 }
1067
1068 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1069                           struct rtable **rp, struct sk_buff *skb)
1070 {
             /*
              * Insert rt at the head of rt_hash_table[hash].chain, or reuse
              * an entry already on the chain whose keys match.
              *
              * The resulting route is stored in *rp when rp is non-NULL,
              * otherwise it is attached to skb with skb_dst_set().
              *
              * Returns 0 on success.  On failure rt is dropped and the
              * error from arp_bind_neighbour() is returned (-ENOBUFS once
              * the retry budget is used up).
              */
1071         struct rtable   *rth, **rthp;
1072         unsigned long   now;
1073         struct rtable *cand, **candp;
1074         u32             min_score;
1075         int             chain_length;
             /* One garbage-collect-and-retry is allowed only outside softirq. */
1076         int attempts = !in_softirq();
1077
1078 restart:
1079         chain_length = 0;
1080         min_score = ~(u32)0;
1081         cand = NULL;
1082         candp = NULL;
1083         now = jiffies;
1084
1085         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1086                 /*
1087                  * If we're not caching, just tell the caller we
1088                  * were successful and don't touch the route.  The
1089                  * caller hold the sole reference to the cache entry, and
1090                  * it will be released when the caller is done with it.
1091                  * If we drop it here, the callers have no way to resolve routes
1092                  * when we're not caching.  Instead, just point *rp at rt, so
1093                  * the caller gets a single use out of the route
1094                  * Note that we do rt_free on this new route entry, so that
1095                  * once its refcount hits zero, we are still able to reap it
1096                  * (Thanks Alexey)
1097                  * Note also the rt_free uses call_rcu.  We don't actually
1098                  * need rcu protection here, this is just our path to get
1099                  * on the route gc list.
1100                  */
1101
1102                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1103                         int err = arp_bind_neighbour(&rt->u.dst);
1104                         if (err) {
1105                                 if (net_ratelimit())
1106                                         printk(KERN_WARNING
1107                                             "Neighbour table failure & not caching routes.\n");
1108                                 rt_drop(rt);
1109                                 return err;
1110                         }
1111                 }
1112
1113                 rt_free(rt);
1114                 goto skip_hashing;
1115         }
1116
1117         rthp = &rt_hash_table[hash].chain;
1118
1119         spin_lock_bh(rt_hash_lock_addr(hash));
             /*
              * Single pass over the chain: unlink expired entries, look for
              * a key match, remember the lowest-scoring unreferenced entry
              * as an eviction candidate, and count the surviving entries.
              */
1120         while ((rth = *rthp) != NULL) {
1121                 if (rt_is_expired(rth)) {
1122                         *rthp = rth->u.dst.rt_next;
1123                         rt_free(rth);
1124                         continue;
1125                 }
1126                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1127                         /* Put it first */
1128                         *rthp = rth->u.dst.rt_next;
1129                         /*
1130                          * Since lookup is lockfree, the deletion
1131                          * must be visible to another weakly ordered CPU before
1132                          * the insertion at the start of the hash chain.
1133                          */
1134                         rcu_assign_pointer(rth->u.dst.rt_next,
1135                                            rt_hash_table[hash].chain);
1136                         /*
1137                          * Since lookup is lockfree, the update writes
1138                          * must be ordered for consistency on SMP.
1139                          */
1140                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1141
1142                         dst_use(&rth->u.dst, now);
1143                         spin_unlock_bh(rt_hash_lock_addr(hash));
1144
                             /* The existing entry wins; the new route is discarded. */
1145                         rt_drop(rt);
1146                         if (rp)
1147                                 *rp = rth;
1148                         else
1149                                 skb_dst_set(skb, &rth->u.dst);
1150                         return 0;
1151                 }
1152
                     /* Only entries with a zero refcount are eviction candidates. */
1153                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1154                         u32 score = rt_score(rth);
1155
1156                         if (score <= min_score) {
1157                                 cand = rth;
1158                                 candp = rthp;
1159                                 min_score = score;
1160                         }
1161                 }
1162
1163                 chain_length++;
1164
1165                 rthp = &rth->u.dst.rt_next;
1166         }
1167
1168         if (cand) {
1169                 /* ip_rt_gc_elasticity used to be average length of chain
1170                  * length, when exceeded gc becomes really aggressive.
1171                  *
1172                  * The second limit is less certain. At the moment it allows
1173                  * only 2 entries per bucket. We will see.
1174                  */
1175                 if (chain_length > ip_rt_gc_elasticity) {
1176                         *candp = cand->u.dst.rt_next;
1177                         rt_free(cand);
1178                 }
1179         } else {
                     /*
                      * No candidate to evict: if the chain is longer than
                      * rt_chain_length_max, count a rebuild for this namespace
                      * and force an emergency rebuild.  Note that this runs
                      * with the bucket lock still held.
                      */
1180                 if (chain_length > rt_chain_length_max) {
1181                         struct net *net = dev_net(rt->u.dst.dev);
1182                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1183                         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1184                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1185                                         rt->u.dst.dev->name, num);
1186                         }
1187                         rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1188                 }
1189         }
1190
1191         /* Try to bind route to arp only if it is output
1192            route or unicast forwarding path.
1193          */
1194         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1195                 int err = arp_bind_neighbour(&rt->u.dst);
1196                 if (err) {
1197                         spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199                         if (err != -ENOBUFS) {
1200                                 rt_drop(rt);
1201                                 return err;
1202                         }
1203
1204                         /* Neighbour tables are full and nothing
1205                            can be released. Try to shrink route cache,
1206                            it is most likely it holds some neighbour records.
1207                          */
1208                         if (attempts-- > 0) {
                                     /*
                                      * Temporarily set the two GC tunables to 1 and 0
                                      * for this one collection, then restore them and
                                      * retry from the top.
                                      */
1209                                 int saved_elasticity = ip_rt_gc_elasticity;
1210                                 int saved_int = ip_rt_gc_min_interval;
1211                                 ip_rt_gc_elasticity     = 1;
1212                                 ip_rt_gc_min_interval   = 0;
1213                                 rt_garbage_collect(&ipv4_dst_ops);
1214                                 ip_rt_gc_min_interval   = saved_int;
1215                                 ip_rt_gc_elasticity     = saved_elasticity;
1216                                 goto restart;
1217                         }
1218
1219                         if (net_ratelimit())
1220                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1221                         rt_drop(rt);
1222                         return -ENOBUFS;
1223                 }
1224         }
1225
1226         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1227
1228 #if RT_CACHE_DEBUG >= 2
1229         if (rt->u.dst.rt_next) {
1230                 struct rtable *trt;
1231                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1232                        hash, &rt->rt_dst);
1233                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1234                         printk(" . %pI4", &trt->rt_dst);
1235                 printk("\n");
1236         }
1237 #endif
1238         /*
1239          * Since lookup is lockfree, we must make sure
1240          * previous writes to rt are committed to memory
1241          * before making rt visible to other CPUS.
1242          */
1243         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1244
1245         spin_unlock_bh(rt_hash_lock_addr(hash));
1246
1247 skip_hashing:
1248         if (rp)
1249                 *rp = rt;
1250         else
1251                 skb_dst_set(skb, &rt->u.dst);
1252         return 0;
1253 }
1254
1255 void rt_bind_peer(struct rtable *rt, int create)
1256 {
             /*
              * Attach an inet_peer for rt->rt_dst to rt, unless rt already
              * has one.  create is passed straight through to inet_getpeer().
              */
1257         static DEFINE_SPINLOCK(rt_peer_lock);
1258         struct inet_peer *peer;
1259
             /* The lookup is done before taking rt_peer_lock. */
1260         peer = inet_getpeer(rt->rt_dst, create);
1261
1262         spin_lock_bh(&rt_peer_lock);
1263         if (rt->peer == NULL) {
1264                 rt->peer = peer;
                     /* Ownership moved to rt; nothing left to release below. */
1265                 peer = NULL;
1266         }
1267         spin_unlock_bh(&rt_peer_lock);
             /* rt already had a peer: give back the one we just looked up. */
1268         if (peer)
1269                 inet_putpeer(peer);
1270 }
1271
1272 /*
1273  * Peer allocation may fail only in serious out-of-memory conditions.  However
1274  * we still can generate some output.
1275  * Random ID selection looks a bit dangerous because we have no chances to
1276  * select ID being unique in a reasonable period of time.
1277  * But broken packet identifier may be better than no packet at all.
1278  */
1279 static void ip_select_fb_ident(struct iphdr *iph)
1280 {
             /*
              * Fallback IP ID selection.  ip_fallback_id is one static value
              * shared by all callers and is updated under ip_fb_id_lock.
              */
1281         static DEFINE_SPINLOCK(ip_fb_id_lock);
1282         static u32 ip_fallback_id;
1283         u32 salt;
1284
1285         spin_lock_bh(&ip_fb_id_lock);
             /* Mix the previous fallback value with the destination address. */
1286         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
             /* The header id takes the low 16 bits, in network byte order. */
1287         iph->id = htons(salt & 0xFFFF);
             /* The full 32-bit result seeds the next call. */
1288         ip_fallback_id = salt;
1289         spin_unlock_bh(&ip_fb_id_lock);
1290 }
1291
1292 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1293 {
             /*
              * Set iph->id from the peer bound to the route, falling back to
              * ip_select_fb_ident() when dst is NULL or no peer is available.
              * more is passed straight through to inet_getid().
              */
1294         struct rtable *rt = (struct rtable *) dst;
1295
1296         if (rt) {
                     /* Bind a peer on first use, asking for one to be created. */
1297                 if (rt->peer == NULL)
1298                         rt_bind_peer(rt, 1);
1299
1300                 /* If peer is attached to destination, it is never detached,
1301                    so that we need not to grab a lock to dereference it.
1302                  */
1303                 if (rt->peer) {
1304                         iph->id = htons(inet_getid(rt->peer, more));
1305                         return;
1306                 }
1307         } else
1308                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1309                        __builtin_return_address(0));
1310
             /* Reached when dst was NULL or rt still has no peer. */
1311         ip_select_fb_ident(iph);
1312 }
1313
1314 static void rt_del(unsigned hash, struct rtable *rt)
1315 {
             /*
              * Unlink rt from rt_hash_table[hash].chain and free it, also
              * reaping any expired entries met on the same chain.  One
              * reference on rt is released via ip_rt_put() on the way.
              */
1316         struct rtable **rthp, *aux;
1317
1318         rthp = &rt_hash_table[hash].chain;
1319         spin_lock_bh(rt_hash_lock_addr(hash));
1320         ip_rt_put(rt);
1321         while ((aux = *rthp) != NULL) {
1322                 if (aux == rt || rt_is_expired(aux)) {
                             /* Unlink in place; rthp already points at the successor. */
1323                         *rthp = aux->u.dst.rt_next;
1324                         rt_free(aux);
1325                         continue;
1326                 }
1327                 rthp = &aux->u.dst.rt_next;
1328         }
1329         spin_unlock_bh(rt_hash_lock_addr(hash));
1330 }
1331
1332 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1333                     __be32 saddr, struct net_device *dev)
1334 {
             /*
              * Handle a redirect received on dev: cached output routes to
              * daddr that currently go through old_gw are replaced by copies
              * pointing at new_gw.
              *
              * Four hash buckets are probed, one for each combination of
              * source key (saddr or 0) and output interface key
              * (dev->ifindex or 0).
              *
              * The in_dev reference taken at the top is released on every
              * exit path.
              */
1335         int i, k;
1336         struct in_device *in_dev = in_dev_get(dev);
1337         struct rtable *rth, **rthp;
1338         __be32  skeys[2] = { saddr, 0 };
1339         int  ikeys[2] = { dev->ifindex, 0 };
1340         struct netevent_redirect netevent;
1341         struct net *net;
1342
1343         if (!in_dev)
1344                 return;
1345
1346         net = dev_net(dev);
             /* Reject a no-op redirect, a disabled device, or an unusable new gateway. */
1347         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1348             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1349             ipv4_is_zeronet(new_gw))
1350                 goto reject_redirect;
1351
1352         if (!rt_caching(net))
1353                 goto reject_redirect;
1354
1355         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1356                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1357                         goto reject_redirect;
1358                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1359                         goto reject_redirect;
1360         } else {
1361                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1362                         goto reject_redirect;
1363         }
1364
1365         for (i = 0; i < 2; i++) {
1366                 for (k = 0; k < 2; k++) {
1367                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1368                                                 rt_genid(net));
1369
1370                         rthp=&rt_hash_table[hash].chain;
1371
1372                         rcu_read_lock();
1373                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1374                                 struct rtable *rt;
1375
                                     /* Skip entries whose lookup keys do not match this probe. */
1376                                 if (rth->fl.fl4_dst != daddr ||
1377                                     rth->fl.fl4_src != skeys[i] ||
1378                                     rth->fl.oif != ikeys[k] ||
1379                                     rth->fl.iif != 0 ||
1380                                     rt_is_expired(rth) ||
1381                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1382                                         rthp = &rth->u.dst.rt_next;
1383                                         continue;
1384                                 }
1385
                                     /*
                                      * Keys match but the route itself does not fit
                                      * the redirect: stop scanning this bucket.
                                      */
1386                                 if (rth->rt_dst != daddr ||
1387                                     rth->rt_src != saddr ||
1388                                     rth->u.dst.error ||
1389                                     rth->rt_gateway != old_gw ||
1390                                     rth->u.dst.dev != dev)
1391                                         break;
1392
                                     /* Pin rth before leaving the RCU read section. */
1393                                 dst_hold(&rth->u.dst);
1394                                 rcu_read_unlock();
1395
1396                                 rt = dst_alloc(&ipv4_dst_ops);
1397                                 if (rt == NULL) {
1398                                         ip_rt_put(rth);
1399                                         in_dev_put(in_dev);
1400                                         return;
1401                                 }
1402
1403                                 /* Copy all the information. */
1404                                 *rt = *rth;
                                     /*
                                      * The structure copy duplicated pointers and
                                      * counters; reset the per-entry state and take
                                      * references for the pointers the copy keeps.
                                      */
1405                                 rt->u.dst.__use         = 1;
1406                                 atomic_set(&rt->u.dst.__refcnt, 1);
1407                                 rt->u.dst.child         = NULL;
1408                                 if (rt->u.dst.dev)
1409                                         dev_hold(rt->u.dst.dev);
1410                                 if (rt->idev)
1411                                         in_dev_hold(rt->idev);
1412                                 rt->u.dst.obsolete      = 0;
1413                                 rt->u.dst.lastuse       = jiffies;
1414                                 rt->u.dst.path          = &rt->u.dst;
1415                                 rt->u.dst.neighbour     = NULL;
1416                                 rt->u.dst.hh            = NULL;
1417 #ifdef CONFIG_XFRM
1418                                 rt->u.dst.xfrm          = NULL;
1419 #endif
1420                                 rt->rt_genid            = rt_genid(net);
1421                                 rt->rt_flags            |= RTCF_REDIRECTED;
1422
1423                                 /* Gateway is different ... */
1424                                 rt->rt_gateway          = new_gw;
1425
1426                                 /* Redirect received -> path was valid */
1427                                 dst_confirm(&rth->u.dst);
1428
1429                                 if (rt->peer)
1430                                         atomic_inc(&rt->peer->refcnt);
1431
                                     /*
                                      * Give up on this entry if the neighbour cannot be
                                      * bound or is not in a NUD_VALID state.
                                      */
1432                                 if (arp_bind_neighbour(&rt->u.dst) ||
1433                                     !(rt->u.dst.neighbour->nud_state &
1434                                             NUD_VALID)) {
1435                                         if (rt->u.dst.neighbour)
1436                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1437                                         ip_rt_put(rth);
1438                                         rt_drop(rt);
1439                                         goto do_next;
1440                                 }
1441
1442                                 netevent.old = &rth->u.dst;
1443                                 netevent.new = &rt->u.dst;
1444                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1445                                                         &netevent);
1446
                                     /*
                                      * rt_del() releases the reference taken on rth
                                      * above.  On successful insertion the reference
                                      * returned through &rt is dropped straight away.
                                      */
1447                                 rt_del(hash, rth);
1448                                 if (!rt_intern_hash(hash, rt, &rt, NULL))
1449                                         ip_rt_put(rt);
1450                                 goto do_next;
1451                         }
                             /* Reached when the chain ended or the break above fired. */
1452                         rcu_read_unlock();
1453                 do_next:
1454                         ;
1455                 }
1456         }
1457         in_dev_put(in_dev);
1458         return;
1459
1460 reject_redirect:
1461 #ifdef CONFIG_IP_ROUTE_VERBOSE
1462         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1463                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1464                         "  Advised path = %pI4 -> %pI4\n",
1465                        &old_gw, dev->name, &new_gw,
1466                        &saddr, &daddr);
1467 #endif
1468         in_dev_put(in_dev);
1469 }
1470
1471 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1472 {
             /*
              * Returns dst unchanged, or NULL after giving the route up.
              * An obsolete route only has its reference released.  A
              * redirected route, or one with an expiry set, is also removed
              * from the hash table.
              */
1473         struct rtable *rt = (struct rtable *)dst;
1474         struct dst_entry *ret = dst;
1475
1476         if (rt) {
1477                 if (dst->obsolete) {
1478                         ip_rt_put(rt);
1479                         ret = NULL;
1480                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1481                            rt->u.dst.expires) {
                             /* Recompute the bucket from the route's own flow keys. */
1482                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1483                                                 rt->fl.oif,
1484                                                 rt_genid(dev_net(dst->dev)));
1485 #if RT_CACHE_DEBUG >= 1
1486                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1487                                 &rt->rt_dst, rt->fl.fl4_tos);
1488 #endif
                             /* rt_del() also releases the caller's reference on rt. */
1489                         rt_del(hash, rt);
1490                         ret = NULL;
1491                 }
1492         }
1493         return ret;
1494 }
1495
1496 /*
1497  * Algorithm:
1498  *      1. The first ip_rt_redirect_number redirects are sent
1499  *         with exponential backoff, then we stop sending them at all,
1500  *         assuming that the host ignores our redirects.
1501  *      2. If we did not see packets requiring redirects
1502  *         during ip_rt_redirect_silence, we assume that the host
1503  *         forgot redirected route and start to send redirects again.
1504  *
1505  * This algorithm is much cheaper and more intelligent than dumb load limiting
1506  * in icmp.c.
1507  *
1508  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1509  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1510  */
1511
1512 void ip_rt_send_redirect(struct sk_buff *skb)
1513 {
             /*
              * Send an ICMP host redirect for skb, rate limited per route by
              * rate_tokens and rate_last in the route's dst entry.
              */
1514         struct rtable *rt = skb_rtable(skb);
1515         struct in_device *in_dev;
1516         int log_martians;
1517
             /* in_dev is only used inside this RCU read section. */
1518         rcu_read_lock();
1519         in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1520         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1521                 rcu_read_unlock();
1522                 return;
1523         }
1524         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1525         rcu_read_unlock();
1526
1527         /* No redirected packets during ip_rt_redirect_silence;
1528          * reset the algorithm.
1529          */
1530         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1531                 rt->u.dst.rate_tokens = 0;
1532
1533         /* Too many ignored redirects; do not send anything
1534          * set u.dst.rate_last to the last seen redirected packet.
1535          */
1536         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1537                 rt->u.dst.rate_last = jiffies;
1538                 return;
1539         }
1540
1541         /* Check for load limit; set rate_last to the latest sent
1542          * redirect.
1543          */
             /*
              * The wait before the next redirect is ip_rt_redirect_load
              * shifted left by the number already sent, so it doubles each
              * time.
              */
1544         if (rt->u.dst.rate_tokens == 0 ||
1545             time_after(jiffies,
1546                        (rt->u.dst.rate_last +
1547                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1548                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1549                 rt->u.dst.rate_last = jiffies;
1550                 ++rt->u.dst.rate_tokens;
1551 #ifdef CONFIG_IP_ROUTE_VERBOSE
                     /* Logged once, at the moment the send limit is reached. */
1552                 if (log_martians &&
1553                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1554                     net_ratelimit())
1555                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1556                                 &rt->rt_src, rt->rt_iif,
1557                                 &rt->rt_dst, &rt->rt_gateway);
1558 #endif
1559         }
1560 }
1561
1562 static int ip_error(struct sk_buff *skb)
1563 {
             /*
              * Map the route's error code to an ICMP destination-unreachable
              * code and send it, subject to a per-route token bucket.
              * The skb is always freed and the return value is always 0.
              */
1564         struct rtable *rt = skb_rtable(skb);
1565         unsigned long now;
1566         int code;
1567
1568         switch (rt->u.dst.error) {
                     /* EINVAL and any unlisted error: drop without sending ICMP. */
1569                 case EINVAL:
1570                 default:
1571                         goto out;
1572                 case EHOSTUNREACH:
1573                         code = ICMP_HOST_UNREACH;
1574                         break;
1575                 case ENETUNREACH:
1576                         code = ICMP_NET_UNREACH;
1577                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1578                                         IPSTATS_MIB_INNOROUTES);
1579                         break;
1580                 case EACCES:
1581                         code = ICMP_PKT_FILTERED;
1582                         break;
1583         }
1584
             /*
              * Tokens grow by the jiffies elapsed since rate_last, capped at
              * ip_rt_error_burst.  Each ICMP sent costs ip_rt_error_cost.
              */
1585         now = jiffies;
1586         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1587         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1588                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1589         rt->u.dst.rate_last = now;
1590         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1591                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1592                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1593         }
1594
1595 out:    kfree_skb(skb);
1596         return 0;
1597 }
1598
/*
 * MTU plateau table, ordered from largest to smallest.
 * The last two values are not from the RFC but are needed for
 * AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau that is strictly smaller than @old_mtu.
 * When @old_mtu is not above any plateau the result is 68.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end =
		mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (plateau != end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1616
1617 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1618                                  unsigned short new_mtu,
1619                                  struct net_device *dev)
1620 {
1621         int i, k;
1622         unsigned short old_mtu = ntohs(iph->tot_len);
1623         struct rtable *rth;
1624         int  ikeys[2] = { dev->ifindex, 0 };
1625         __be32  skeys[2] = { iph->saddr, 0, };
1626         __be32  daddr = iph->daddr;
1627         unsigned short est_mtu = 0;
1628
1629         for (k = 0; k < 2; k++) {
1630                 for (i = 0; i < 2; i++) {
1631                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1632                                                 rt_genid(net));
1633
1634                         rcu_read_lock();
1635                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1636                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1637                                 unsigned short mtu = new_mtu;
1638
1639                                 if (rth->fl.fl4_dst != daddr ||
1640                                     rth->fl.fl4_src != skeys[i] ||
1641                                     rth->rt_dst != daddr ||
1642                                     rth->rt_src != iph->saddr ||
1643                                     rth->fl.oif != ikeys[k] ||
1644                                     rth->fl.iif != 0 ||
1645                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1646                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1647                                     rt_is_expired(rth))
1648                                         continue;
1649
1650                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1651
1652                                         /* BSD 4.2 compatibility hack :-( */
1653                                         if (mtu == 0 &&
1654                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1655                                             old_mtu >= 68 + (iph->ihl << 2))
1656                                                 old_mtu -= iph->ihl << 2;
1657
1658                                         mtu = guess_mtu(old_mtu);
1659                                 }
1660                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1661                                         if (mtu < dst_mtu(&rth->u.dst)) {
1662                                                 dst_confirm(&rth->u.dst);
1663                                                 if (mtu < ip_rt_min_pmtu) {
1664                                                         mtu = ip_rt_min_pmtu;
1665                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1666                                                                 (1 << RTAX_MTU);
1667                                                 }
1668                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1669                                                 dst_set_expires(&rth->u.dst,
1670                                                         ip_rt_mtu_expires);
1671                                         }
1672                                         est_mtu = mtu;
1673                                 }
1674                         }
1675                         rcu_read_unlock();
1676                 }
1677         }
1678         return est_mtu ? : new_mtu;
1679 }
1680
1681 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1682 {
1683         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1684             !(dst_metric_locked(dst, RTAX_MTU))) {
1685                 if (mtu < ip_rt_min_pmtu) {
1686                         mtu = ip_rt_min_pmtu;
1687                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1688                 }
1689                 dst->metrics[RTAX_MTU-1] = mtu;
1690                 dst_set_expires(dst, ip_rt_mtu_expires);
1691                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1692         }
1693 }
1694
1695 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1696 {
1697         return NULL;
1698 }
1699
1700 static void ipv4_dst_destroy(struct dst_entry *dst)
1701 {
1702         struct rtable *rt = (struct rtable *) dst;
1703         struct inet_peer *peer = rt->peer;
1704         struct in_device *idev = rt->idev;
1705
1706         if (peer) {
1707                 rt->peer = NULL;
1708                 inet_putpeer(peer);
1709         }
1710
1711         if (idev) {
1712                 rt->idev = NULL;
1713                 in_dev_put(idev);
1714         }
1715 }
1716
1717 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1718                             int how)
1719 {
1720         struct rtable *rt = (struct rtable *) dst;
1721         struct in_device *idev = rt->idev;
1722         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1723                 struct in_device *loopback_idev =
1724                         in_dev_get(dev_net(dev)->loopback_dev);
1725                 if (loopback_idev) {
1726                         rt->idev = loopback_idev;
1727                         in_dev_put(idev);
1728                 }
1729         }
1730 }
1731
1732 static void ipv4_link_failure(struct sk_buff *skb)
1733 {
1734         struct rtable *rt;
1735
1736         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1737
1738         rt = skb_rtable(skb);
1739         if (rt)
1740                 dst_set_expires(&rt->u.dst, 0);
1741 }
1742
1743 static int ip_rt_bug(struct sk_buff *skb)
1744 {
1745         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1746                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1747                 skb->dev ? skb->dev->name : "?");
1748         kfree_skb(skb);
1749         return 0;
1750 }
1751
1752 /*
1753    We do not cache source address of outgoing interface,
1754    because it is used only by IP RR, TS and SRR options,
1755    so that it out of fast path.
1756
1757    BTW remember: "addr" is allowed to be not aligned
1758    in IP options!
1759  */
1760
1761 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1762 {
1763         __be32 src;
1764         struct fib_result res;
1765
1766         if (rt->fl.iif == 0)
1767                 src = rt->rt_src;
1768         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1769                 src = FIB_RES_PREFSRC(res);
1770                 fib_res_put(&res);
1771         } else
1772                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1773                                         RT_SCOPE_UNIVERSE);
1774         memcpy(addr, &src, 4);
1775 }
1776
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves
 * are handled independently: a half is taken from @tag only when the
 * corresponding half of the existing tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;

	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1786
1787 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1788 {
1789         struct fib_info *fi = res->fi;
1790
1791         if (fi) {
1792                 if (FIB_RES_GW(*res) &&
1793                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1794                         rt->rt_gateway = FIB_RES_GW(*res);
1795                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1796                        sizeof(rt->u.dst.metrics));
1797                 if (fi->fib_mtu == 0) {
1798                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1799                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1800                             rt->rt_gateway != rt->rt_dst &&
1801                             rt->u.dst.dev->mtu > 576)
1802                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1803                 }
1804 #ifdef CONFIG_NET_CLS_ROUTE
1805                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1806 #endif
1807         } else
1808                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1809
1810         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1811                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1812         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1813                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1814         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1815                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1816                                        ip_rt_min_advmss);
1817         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1818                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1819
1820 #ifdef CONFIG_NET_CLS_ROUTE
1821 #ifdef CONFIG_IP_MULTIPLE_TABLES
1822         set_class_tag(rt, fib_rules_tclass(res));
1823 #endif
1824         set_class_tag(rt, itag);
1825 #endif
1826         rt->rt_type = res->type;
1827 }
1828
1829 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1830                                 u8 tos, struct net_device *dev, int our)
1831 {
1832         unsigned hash;
1833         struct rtable *rth;
1834         __be32 spec_dst;
1835         struct in_device *in_dev = in_dev_get(dev);
1836         u32 itag = 0;
1837
1838         /* Primary sanity checks. */
1839
1840         if (in_dev == NULL)
1841                 return -EINVAL;
1842
1843         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1844             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1845                 goto e_inval;
1846
1847         if (ipv4_is_zeronet(saddr)) {
1848                 if (!ipv4_is_local_multicast(daddr))
1849                         goto e_inval;
1850                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1851         } else if (fib_validate_source(saddr, 0, tos, 0,
1852                                         dev, &spec_dst, &itag, 0) < 0)
1853                 goto e_inval;
1854
1855         rth = dst_alloc(&ipv4_dst_ops);
1856         if (!rth)
1857                 goto e_nobufs;
1858
1859         rth->u.dst.output= ip_rt_bug;
1860
1861         atomic_set(&rth->u.dst.__refcnt, 1);
1862         rth->u.dst.flags= DST_HOST;
1863         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1864                 rth->u.dst.flags |= DST_NOPOLICY;
1865         rth->fl.fl4_dst = daddr;
1866         rth->rt_dst     = daddr;
1867         rth->fl.fl4_tos = tos;
1868         rth->fl.mark    = skb->mark;
1869         rth->fl.fl4_src = saddr;
1870         rth->rt_src     = saddr;
1871 #ifdef CONFIG_NET_CLS_ROUTE
1872         rth->u.dst.tclassid = itag;
1873 #endif
1874         rth->rt_iif     =
1875         rth->fl.iif     = dev->ifindex;
1876         rth->u.dst.dev  = init_net.loopback_dev;
1877         dev_hold(rth->u.dst.dev);
1878         rth->idev       = in_dev_get(rth->u.dst.dev);
1879         rth->fl.oif     = 0;
1880         rth->rt_gateway = daddr;
1881         rth->rt_spec_dst= spec_dst;
1882         rth->rt_genid   = rt_genid(dev_net(dev));
1883         rth->rt_flags   = RTCF_MULTICAST;
1884         rth->rt_type    = RTN_MULTICAST;
1885         if (our) {
1886                 rth->u.dst.input= ip_local_deliver;
1887                 rth->rt_flags |= RTCF_LOCAL;
1888         }
1889
1890 #ifdef CONFIG_IP_MROUTE
1891         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1892                 rth->u.dst.input = ip_mr_input;
1893 #endif
1894         RT_CACHE_STAT_INC(in_slow_mc);
1895
1896         in_dev_put(in_dev);
1897         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1898         return rt_intern_hash(hash, rth, NULL, skb);
1899
1900 e_nobufs:
1901         in_dev_put(in_dev);
1902         return -ENOBUFS;
1903
1904 e_inval:
1905         in_dev_put(in_dev);
1906         return -EINVAL;
1907 }
1908
1909
1910 static void ip_handle_martian_source(struct net_device *dev,
1911                                      struct in_device *in_dev,
1912                                      struct sk_buff *skb,
1913                                      __be32 daddr,
1914                                      __be32 saddr)
1915 {
1916         RT_CACHE_STAT_INC(in_martian_src);
1917 #ifdef CONFIG_IP_ROUTE_VERBOSE
1918         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1919                 /*
1920                  *      RFC1812 recommendation, if source is martian,
1921                  *      the only hint is MAC header.
1922                  */
1923                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1924                         &daddr, &saddr, dev->name);
1925                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1926                         int i;
1927                         const unsigned char *p = skb_mac_header(skb);
1928                         printk(KERN_WARNING "ll header: ");
1929                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1930                                 printk("%02x", *p);
1931                                 if (i < (dev->hard_header_len - 1))
1932                                         printk(":");
1933                         }
1934                         printk("\n");
1935                 }
1936         }
1937 #endif
1938 }
1939
1940 static int __mkroute_input(struct sk_buff *skb,
1941                            struct fib_result *res,
1942                            struct in_device *in_dev,
1943                            __be32 daddr, __be32 saddr, u32 tos,
1944                            struct rtable **result)
1945 {
1946
1947         struct rtable *rth;
1948         int err;
1949         struct in_device *out_dev;
1950         unsigned flags = 0;
1951         __be32 spec_dst;
1952         u32 itag;
1953
1954         /* get a working reference to the output device */
1955         out_dev = in_dev_get(FIB_RES_DEV(*res));
1956         if (out_dev == NULL) {
1957                 if (net_ratelimit())
1958                         printk(KERN_CRIT "Bug in ip_route_input" \
1959                                "_slow(). Please, report\n");
1960                 return -EINVAL;
1961         }
1962
1963
1964         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1965                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1966         if (err < 0) {
1967                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1968                                          saddr);
1969
1970                 err = -EINVAL;
1971                 goto cleanup;
1972         }
1973
1974         if (err)
1975                 flags |= RTCF_DIRECTSRC;
1976
1977         if (out_dev == in_dev && err &&
1978             (IN_DEV_SHARED_MEDIA(out_dev) ||
1979              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1980                 flags |= RTCF_DOREDIRECT;
1981
1982         if (skb->protocol != htons(ETH_P_IP)) {
1983                 /* Not IP (i.e. ARP). Do not create route, if it is
1984                  * invalid for proxy arp. DNAT routes are always valid.
1985                  */
1986                 if (out_dev == in_dev) {
1987                         err = -EINVAL;
1988                         goto cleanup;
1989                 }
1990         }
1991
1992
1993         rth = dst_alloc(&ipv4_dst_ops);
1994         if (!rth) {
1995                 err = -ENOBUFS;
1996                 goto cleanup;
1997         }
1998
1999         atomic_set(&rth->u.dst.__refcnt, 1);
2000         rth->u.dst.flags= DST_HOST;
2001         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2002                 rth->u.dst.flags |= DST_NOPOLICY;
2003         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2004                 rth->u.dst.flags |= DST_NOXFRM;
2005         rth->fl.fl4_dst = daddr;
2006         rth->rt_dst     = daddr;
2007         rth->fl.fl4_tos = tos;
2008         rth->fl.mark    = skb->mark;
2009         rth->fl.fl4_src = saddr;
2010         rth->rt_src     = saddr;
2011         rth->rt_gateway = daddr;
2012         rth->rt_iif     =
2013                 rth->fl.iif     = in_dev->dev->ifindex;
2014         rth->u.dst.dev  = (out_dev)->dev;
2015         dev_hold(rth->u.dst.dev);
2016         rth->idev       = in_dev_get(rth->u.dst.dev);
2017         rth->fl.oif     = 0;
2018         rth->rt_spec_dst= spec_dst;
2019
2020         rth->u.dst.input = ip_forward;
2021         rth->u.dst.output = ip_output;
2022         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2023
2024         rt_set_nexthop(rth, res, itag);
2025
2026         rth->rt_flags = flags;
2027
2028         *result = rth;
2029         err = 0;
2030  cleanup:
2031         /* release the working reference to the output device */
2032         in_dev_put(out_dev);
2033         return err;
2034 }
2035
2036 static int ip_mkroute_input(struct sk_buff *skb,
2037                             struct fib_result *res,
2038                             const struct flowi *fl,
2039                             struct in_device *in_dev,
2040                             __be32 daddr, __be32 saddr, u32 tos)
2041 {
2042         struct rtable* rth = NULL;
2043         int err;
2044         unsigned hash;
2045
2046 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2047         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2048                 fib_select_multipath(fl, res);
2049 #endif
2050
2051         /* create a routing cache entry */
2052         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2053         if (err)
2054                 return err;
2055
2056         /* put it into the cache */
2057         hash = rt_hash(daddr, saddr, fl->iif,
2058                        rt_genid(dev_net(rth->u.dst.dev)));
2059         return rt_intern_hash(hash, rth, NULL, skb);
2060 }
2061
2062 /*
2063  *      NOTE. We drop all the packets that has local source
2064  *      addresses, because every properly looped back packet
2065  *      must have correct destination already attached by output routine.
2066  *
2067  *      Such approach solves two big problems:
2068  *      1. Not simplex devices are handled properly.
2069  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2070  */
2071
2072 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2073                                u8 tos, struct net_device *dev)
2074 {
2075         struct fib_result res;
2076         struct in_device *in_dev = in_dev_get(dev);
2077         struct flowi fl = { .nl_u = { .ip4_u =
2078                                       { .daddr = daddr,
2079                                         .saddr = saddr,
2080                                         .tos = tos,
2081                                         .scope = RT_SCOPE_UNIVERSE,
2082                                       } },
2083                             .mark = skb->mark,
2084                             .iif = dev->ifindex };
2085         unsigned        flags = 0;
2086         u32             itag = 0;
2087         struct rtable * rth;
2088         unsigned        hash;
2089         __be32          spec_dst;
2090         int             err = -EINVAL;
2091         int             free_res = 0;
2092         struct net    * net = dev_net(dev);
2093
2094         /* IP on this device is disabled. */
2095
2096         if (!in_dev)
2097                 goto out;
2098
2099         /* Check for the most weird martians, which can be not detected
2100            by fib_lookup.
2101          */
2102
2103         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2104             ipv4_is_loopback(saddr))
2105                 goto martian_source;
2106
2107         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2108                 goto brd_input;
2109
2110         /* Accept zero addresses only to limited broadcast;
2111          * I even do not know to fix it or not. Waiting for complains :-)
2112          */
2113         if (ipv4_is_zeronet(saddr))
2114                 goto martian_source;
2115
2116         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2117             ipv4_is_loopback(daddr))
2118                 goto martian_destination;
2119
2120         /*
2121          *      Now we are ready to route packet.
2122          */
2123         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2124                 if (!IN_DEV_FORWARD(in_dev))
2125                         goto e_hostunreach;
2126                 goto no_route;
2127         }
2128         free_res = 1;
2129
2130         RT_CACHE_STAT_INC(in_slow_tot);
2131
2132         if (res.type == RTN_BROADCAST)
2133                 goto brd_input;
2134
2135         if (res.type == RTN_LOCAL) {
2136                 int result;
2137                 result = fib_validate_source(saddr, daddr, tos,
2138                                              net->loopback_dev->ifindex,
2139                                              dev, &spec_dst, &itag, skb->mark);
2140                 if (result < 0)
2141                         goto martian_source;
2142                 if (result)
2143                         flags |= RTCF_DIRECTSRC;
2144                 spec_dst = daddr;
2145                 goto local_input;
2146         }
2147
2148         if (!IN_DEV_FORWARD(in_dev))
2149                 goto e_hostunreach;
2150         if (res.type != RTN_UNICAST)
2151                 goto martian_destination;
2152
2153         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2154 done:
2155         in_dev_put(in_dev);
2156         if (free_res)
2157                 fib_res_put(&res);
2158 out:    return err;
2159
2160 brd_input:
2161         if (skb->protocol != htons(ETH_P_IP))
2162                 goto e_inval;
2163
2164         if (ipv4_is_zeronet(saddr))
2165                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2166         else {
2167                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2168                                           &itag, skb->mark);
2169                 if (err < 0)
2170                         goto martian_source;
2171                 if (err)
2172                         flags |= RTCF_DIRECTSRC;
2173         }
2174         flags |= RTCF_BROADCAST;
2175         res.type = RTN_BROADCAST;
2176         RT_CACHE_STAT_INC(in_brd);
2177
2178 local_input:
2179         rth = dst_alloc(&ipv4_dst_ops);
2180         if (!rth)
2181                 goto e_nobufs;
2182
2183         rth->u.dst.output= ip_rt_bug;
2184         rth->rt_genid = rt_genid(net);
2185
2186         atomic_set(&rth->u.dst.__refcnt, 1);
2187         rth->u.dst.flags= DST_HOST;
2188         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2189                 rth->u.dst.flags |= DST_NOPOLICY;
2190         rth->fl.fl4_dst = daddr;
2191         rth->rt_dst     = daddr;
2192         rth->fl.fl4_tos = tos;
2193         rth->fl.mark    = skb->mark;
2194         rth->fl.fl4_src = saddr;
2195         rth->rt_src     = saddr;
2196 #ifdef CONFIG_NET_CLS_ROUTE
2197         rth->u.dst.tclassid = itag;
2198 #endif
2199         rth->rt_iif     =
2200         rth->fl.iif     = dev->ifindex;
2201         rth->u.dst.dev  = net->loopback_dev;
2202         dev_hold(rth->u.dst.dev);
2203         rth->idev       = in_dev_get(rth->u.dst.dev);
2204         rth->rt_gateway = daddr;
2205         rth->rt_spec_dst= spec_dst;
2206         rth->u.dst.input= ip_local_deliver;
2207         rth->rt_flags   = flags|RTCF_LOCAL;
2208         if (res.type == RTN_UNREACHABLE) {
2209                 rth->u.dst.input= ip_error;
2210                 rth->u.dst.error= -err;
2211                 rth->rt_flags   &= ~RTCF_LOCAL;
2212         }
2213         rth->rt_type    = res.type;
2214         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2215         err = rt_intern_hash(hash, rth, NULL, skb);
2216         goto done;
2217
2218 no_route:
2219         RT_CACHE_STAT_INC(in_no_route);
2220         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2221         res.type = RTN_UNREACHABLE;
2222         if (err == -ESRCH)
2223                 err = -ENETUNREACH;
2224         goto local_input;
2225
2226         /*
2227          *      Do not cache martian addresses: they should be logged (RFC1812)
2228          */
2229 martian_destination:
2230         RT_CACHE_STAT_INC(in_martian_dst);
2231 #ifdef CONFIG_IP_ROUTE_VERBOSE
2232         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2233                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2234                         &daddr, &saddr, dev->name);
2235 #endif
2236
2237 e_hostunreach:
2238         err = -EHOSTUNREACH;
2239         goto done;
2240
2241 e_inval:
2242         err = -EINVAL;
2243         goto done;
2244
2245 e_nobufs:
2246         err = -ENOBUFS;
2247         goto done;
2248
2249 martian_source:
2250         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2251         goto e_inval;
2252 }
2253
2254 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2255                    u8 tos, struct net_device *dev)
2256 {
2257         struct rtable * rth;
2258         unsigned        hash;
2259         int iif = dev->ifindex;
2260         struct net *net;
2261
2262         net = dev_net(dev);
2263
2264         if (!rt_caching(net))
2265                 goto skip_cache;
2266
2267         tos &= IPTOS_RT_MASK;
2268         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2269
2270         rcu_read_lock();
2271         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2272              rth = rcu_dereference(rth->u.dst.rt_next)) {
2273                 if (((rth->fl.fl4_dst ^ daddr) |
2274                      (rth->fl.fl4_src ^ saddr) |
2275                      (rth->fl.iif ^ iif) |
2276                      rth->fl.oif |
2277                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2278                     rth->fl.mark == skb->mark &&
2279                     net_eq(dev_net(rth->u.dst.dev), net) &&
2280                     !rt_is_expired(rth)) {
2281                         dst_use(&rth->u.dst, jiffies);
2282                         RT_CACHE_STAT_INC(in_hit);
2283                         rcu_read_unlock();
2284                         skb_dst_set(skb, &rth->u.dst);
2285                         return 0;
2286                 }
2287                 RT_CACHE_STAT_INC(in_hlist_search);
2288         }
2289         rcu_read_unlock();
2290
2291 skip_cache:
2292         /* Multicast recognition logic is moved from route cache to here.
2293            The problem was that too many Ethernet cards have broken/missing
2294            hardware multicast filters :-( As result the host on multicasting
2295            network acquires a lot of useless route cache entries, sort of
2296            SDR messages from all the world. Now we try to get rid of them.
2297            Really, provided software IP multicast filter is organized
2298            reasonably (at least, hashed), it does not result in a slowdown
2299            comparing with route cache reject entries.
2300            Note, that multicast routers are not affected, because
2301            route cache entry is created eventually.
2302          */
2303         if (ipv4_is_multicast(daddr)) {
2304                 struct in_device *in_dev;
2305
2306                 rcu_read_lock();
2307                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2308                         int our = ip_check_mc(in_dev, daddr, saddr,
2309                                 ip_hdr(skb)->protocol);
2310                         if (our
2311 #ifdef CONFIG_IP_MROUTE
2312                                 ||
2313                             (!ipv4_is_local_multicast(daddr) &&
2314                              IN_DEV_MFORWARD(in_dev))
2315 #endif
2316                            ) {
2317                                 rcu_read_unlock();
2318                                 return ip_route_input_mc(skb, daddr, saddr,
2319                                                          tos, dev, our);
2320                         }
2321                 }
2322                 rcu_read_unlock();
2323                 return -EINVAL;
2324         }
2325         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2326 }
2327
2328 static int __mkroute_output(struct rtable **result,
2329                             struct fib_result *res,
2330                             const struct flowi *fl,
2331                             const struct flowi *oldflp,
2332                             struct net_device *dev_out,
2333                             unsigned flags)
2334 {
2335         struct rtable *rth;
2336         struct in_device *in_dev;
2337         u32 tos = RT_FL_TOS(oldflp);
2338         int err = 0;
2339
2340         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2341                 return -EINVAL;
2342
2343         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2344                 res->type = RTN_BROADCAST;
2345         else if (ipv4_is_multicast(fl->fl4_dst))
2346                 res->type = RTN_MULTICAST;
2347         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2348                 return -EINVAL;
2349
2350         if (dev_out->flags & IFF_LOOPBACK)
2351                 flags |= RTCF_LOCAL;
2352
2353         /* get work reference to inet device */
2354         in_dev = in_dev_get(dev_out);
2355         if (!in_dev)
2356                 return -EINVAL;
2357
2358         if (res->type == RTN_BROADCAST) {
2359                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2360                 if (res->fi) {
2361                         fib_info_put(res->fi);
2362                         res->fi = NULL;
2363                 }
2364         } else if (res->type == RTN_MULTICAST) {
2365                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2366                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2367                                  oldflp->proto))
2368                         flags &= ~RTCF_LOCAL;
2369                 /* If multicast route do not exist use
2370                    default one, but do not gateway in this case.
2371                    Yes, it is hack.
2372                  */
2373                 if (res->fi && res->prefixlen < 4) {
2374                         fib_info_put(res->fi);
2375                         res->fi = NULL;
2376                 }
2377         }
2378
2379
2380         rth = dst_alloc(&ipv4_dst_ops);
2381         if (!rth) {
2382                 err = -ENOBUFS;
2383                 goto cleanup;
2384         }
2385
2386         atomic_set(&rth->u.dst.__refcnt, 1);
2387         rth->u.dst.flags= DST_HOST;
2388         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2389                 rth->u.dst.flags |= DST_NOXFRM;
2390         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2391                 rth->u.dst.flags |= DST_NOPOLICY;
2392
2393         rth->fl.fl4_dst = oldflp->fl4_dst;
2394         rth->fl.fl4_tos = tos;
2395         rth->fl.fl4_src = oldflp->fl4_src;
2396         rth->fl.oif     = oldflp->oif;
2397         rth->fl.mark    = oldflp->mark;
2398         rth->rt_dst     = fl->fl4_dst;
2399         rth->rt_src     = fl->fl4_src;
2400         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2401         /* get references to the devices that are to be hold by the routing
2402            cache entry */
2403         rth->u.dst.dev  = dev_out;
2404         dev_hold(dev_out);
2405         rth->idev       = in_dev_get(dev_out);
2406         rth->rt_gateway = fl->fl4_dst;
2407         rth->rt_spec_dst= fl->fl4_src;
2408
2409         rth->u.dst.output=ip_output;
2410         rth->rt_genid = rt_genid(dev_net(dev_out));
2411
2412         RT_CACHE_STAT_INC(out_slow_tot);
2413
2414         if (flags & RTCF_LOCAL) {
2415                 rth->u.dst.input = ip_local_deliver;
2416                 rth->rt_spec_dst = fl->fl4_dst;
2417         }
2418         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2419                 rth->rt_spec_dst = fl->fl4_src;
2420                 if (flags & RTCF_LOCAL &&
2421                     !(dev_out->flags & IFF_LOOPBACK)) {
2422                         rth->u.dst.output = ip_mc_output;
2423                         RT_CACHE_STAT_INC(out_slow_mc);
2424                 }
2425 #ifdef CONFIG_IP_MROUTE
2426                 if (res->type == RTN_MULTICAST) {
2427                         if (IN_DEV_MFORWARD(in_dev) &&
2428                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2429                                 rth->u.dst.input = ip_mr_input;
2430                                 rth->u.dst.output = ip_mc_output;
2431                         }
2432                 }
2433 #endif
2434         }
2435
2436         rt_set_nexthop(rth, res, 0);
2437
2438         rth->rt_flags = flags;
2439
2440         *result = rth;
2441  cleanup:
2442         /* release work reference to inet device */
2443         in_dev_put(in_dev);
2444
2445         return err;
2446 }
2447
2448 static int ip_mkroute_output(struct rtable **rp,
2449                              struct fib_result *res,
2450                              const struct flowi *fl,
2451                              const struct flowi *oldflp,
2452                              struct net_device *dev_out,
2453                              unsigned flags)
2454 {
2455         struct rtable *rth = NULL;
2456         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2457         unsigned hash;
2458         if (err == 0) {
2459                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2460                                rt_genid(dev_net(dev_out)));
2461                 err = rt_intern_hash(hash, rth, rp, NULL);
2462         }
2463
2464         return err;
2465 }
2466
2467 /*
2468  * Major route resolver routine.
2469  */
2470
2471 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2472                                 const struct flowi *oldflp)
2473 {
2474         u32 tos = RT_FL_TOS(oldflp);
2475         struct flowi fl = { .nl_u = { .ip4_u =
2476                                       { .daddr = oldflp->fl4_dst,
2477                                         .saddr = oldflp->fl4_src,
2478                                         .tos = tos & IPTOS_RT_MASK,
2479                                         .scope = ((tos & RTO_ONLINK) ?
2480                                                   RT_SCOPE_LINK :
2481                                                   RT_SCOPE_UNIVERSE),
2482                                       } },
2483                             .mark = oldflp->mark,
2484                             .iif = net->loopback_dev->ifindex,
2485                             .oif = oldflp->oif };
2486         struct fib_result res;
2487         unsigned flags = 0;
2488         struct net_device *dev_out = NULL;
2489         int free_res = 0;
2490         int err;
2491
2492
2493         res.fi          = NULL;
2494 #ifdef CONFIG_IP_MULTIPLE_TABLES
2495         res.r           = NULL;
2496 #endif
2497
2498         if (oldflp->fl4_src) {
2499                 err = -EINVAL;
2500                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2501                     ipv4_is_lbcast(oldflp->fl4_src) ||
2502                     ipv4_is_zeronet(oldflp->fl4_src))
2503                         goto out;
2504
2505                 /* I removed check for oif == dev_out->oif here.
2506                    It was wrong for two reasons:
2507                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2508                       is assigned to multiple interfaces.
2509                    2. Moreover, we are allowed to send packets with saddr
2510                       of another iface. --ANK
2511                  */
2512
2513                 if (oldflp->oif == 0 &&
2514                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2515                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2516                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2517                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2518                         if (dev_out == NULL)
2519                                 goto out;
2520
2521                         /* Special hack: user can direct multicasts
2522                            and limited broadcast via necessary interface
2523                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2524                            This hack is not just for fun, it allows
2525                            vic,vat and friends to work.
2526                            They bind socket to loopback, set ttl to zero
2527                            and expect that it will work.
2528                            From the viewpoint of routing cache they are broken,
2529                            because we are not allowed to build multicast path
2530                            with loopback source addr (look, routing cache
2531                            cannot know, that ttl is zero, so that packet
2532                            will not leave this host and route is valid).
2533                            Luckily, this hack is good workaround.
2534                          */
2535
2536                         fl.oif = dev_out->ifindex;
2537                         goto make_route;
2538                 }
2539
2540                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2541                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2542                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2543                         if (dev_out == NULL)
2544                                 goto out;
2545                         dev_put(dev_out);
2546                         dev_out = NULL;
2547                 }
2548         }
2549
2550
2551         if (oldflp->oif) {
2552                 dev_out = dev_get_by_index(net, oldflp->oif);
2553                 err = -ENODEV;
2554                 if (dev_out == NULL)
2555                         goto out;
2556
2557                 /* RACE: Check return value of inet_select_addr instead. */
2558                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2559                         dev_put(dev_out);
2560                         goto out;       /* Wrong error code */
2561                 }
2562
2563                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2564                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2565                         if (!fl.fl4_src)
2566                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2567                                                               RT_SCOPE_LINK);
2568                         goto make_route;
2569                 }
2570                 if (!fl.fl4_src) {
2571                         if (ipv4_is_multicast(oldflp->fl4_dst))
2572                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2573                                                               fl.fl4_scope);
2574                         else if (!oldflp->fl4_dst)
2575                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2576                                                               RT_SCOPE_HOST);
2577                 }
2578         }
2579
2580         if (!fl.fl4_dst) {
2581                 fl.fl4_dst = fl.fl4_src;
2582                 if (!fl.fl4_dst)
2583                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2584                 if (dev_out)
2585                         dev_put(dev_out);
2586                 dev_out = net->loopback_dev;
2587                 dev_hold(dev_out);
2588                 fl.oif = net->loopback_dev->ifindex;
2589                 res.type = RTN_LOCAL;
2590                 flags |= RTCF_LOCAL;
2591                 goto make_route;
2592         }
2593
2594         if (fib_lookup(net, &fl, &res)) {
2595                 res.fi = NULL;
2596                 if (oldflp->oif) {
2597                         /* Apparently, routing tables are wrong. Assume,
2598                            that the destination is on link.
2599
2600                            WHY? DW.
2601                            Because we are allowed to send to iface
2602                            even if it has NO routes and NO assigned
2603                            addresses. When oif is specified, routing
2604                            tables are looked up with only one purpose:
2605                            to catch if destination is gatewayed, rather than
2606                            direct. Moreover, if MSG_DONTROUTE is set,
2607                            we send packet, ignoring both routing tables
2608                            and ifaddr state. --ANK
2609
2610
2611                            We could make it even if oif is unknown,
2612                            likely IPv6, but we do not.
2613                          */
2614
2615                         if (fl.fl4_src == 0)
2616                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2617                                                               RT_SCOPE_LINK);
2618                         res.type = RTN_UNICAST;
2619                         goto make_route;
2620                 }
2621                 if (dev_out)
2622                         dev_put(dev_out);
2623                 err = -ENETUNREACH;
2624                 goto out;
2625         }
2626         free_res = 1;
2627
2628         if (res.type == RTN_LOCAL) {
2629                 if (!fl.fl4_src)
2630                         fl.fl4_src = fl.fl4_dst;
2631                 if (dev_out)
2632                         dev_put(dev_out);
2633                 dev_out = net->loopback_dev;
2634                 dev_hold(dev_out);
2635                 fl.oif = dev_out->ifindex;
2636                 if (res.fi)
2637                         fib_info_put(res.fi);
2638                 res.fi = NULL;
2639                 flags |= RTCF_LOCAL;
2640                 goto make_route;
2641         }
2642
2643 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2644         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2645                 fib_select_multipath(&fl, &res);
2646         else
2647 #endif
2648         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2649                 fib_select_default(net, &fl, &res);
2650
2651         if (!fl.fl4_src)
2652                 fl.fl4_src = FIB_RES_PREFSRC(res);
2653
2654         if (dev_out)
2655                 dev_put(dev_out);
2656         dev_out = FIB_RES_DEV(res);
2657         dev_hold(dev_out);
2658         fl.oif = dev_out->ifindex;
2659
2660
2661 make_route:
2662         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2663
2664
2665         if (free_res)
2666                 fib_res_put(&res);
2667         if (dev_out)
2668                 dev_put(dev_out);
2669 out:    return err;
2670 }
2671
2672 int __ip_route_output_key(struct net *net, struct rtable **rp,
2673                           const struct flowi *flp)
2674 {
2675         unsigned hash;
2676         struct rtable *rth;
2677
2678         if (!rt_caching(net))
2679                 goto slow_output;
2680
2681         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2682
2683         rcu_read_lock_bh();
2684         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2685                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2686                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2687                     rth->fl.fl4_src == flp->fl4_src &&
2688                     rth->fl.iif == 0 &&
2689                     rth->fl.oif == flp->oif &&
2690                     rth->fl.mark == flp->mark &&
2691                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2692                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2693                     net_eq(dev_net(rth->u.dst.dev), net) &&
2694                     !rt_is_expired(rth)) {
2695                         dst_use(&rth->u.dst, jiffies);
2696                         RT_CACHE_STAT_INC(out_hit);
2697                         rcu_read_unlock_bh();
2698                         *rp = rth;
2699                         return 0;
2700                 }
2701                 RT_CACHE_STAT_INC(out_hlist_search);
2702         }
2703         rcu_read_unlock_bh();
2704
2705 slow_output:
2706         return ip_route_output_slow(net, rp, flp);
2707 }
2708
2709 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2710
2711 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2712 {
2713 }
2714
2715 static struct dst_ops ipv4_dst_blackhole_ops = {
2716         .family                 =       AF_INET,
2717         .protocol               =       cpu_to_be16(ETH_P_IP),
2718         .destroy                =       ipv4_dst_destroy,
2719         .check                  =       ipv4_dst_check,
2720         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2721         .entries                =       ATOMIC_INIT(0),
2722 };
2723
2724
2725 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2726 {
2727         struct rtable *ort = *rp;
2728         struct rtable *rt = (struct rtable *)
2729                 dst_alloc(&ipv4_dst_blackhole_ops);
2730
2731         if (rt) {
2732                 struct dst_entry *new = &rt->u.dst;
2733
2734                 atomic_set(&new->__refcnt, 1);
2735                 new->__use = 1;
2736                 new->input = dst_discard;
2737                 new->output = dst_discard;
2738                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2739
2740                 new->dev = ort->u.dst.dev;
2741                 if (new->dev)
2742                         dev_hold(new->dev);
2743
2744                 rt->fl = ort->fl;
2745
2746                 rt->idev = ort->idev;
2747                 if (rt->idev)
2748                         in_dev_hold(rt->idev);
2749                 rt->rt_genid = rt_genid(net);
2750                 rt->rt_flags = ort->rt_flags;
2751                 rt->rt_type = ort->rt_type;
2752                 rt->rt_dst = ort->rt_dst;
2753                 rt->rt_src = ort->rt_src;
2754                 rt->rt_iif = ort->rt_iif;
2755                 rt->rt_gateway = ort->rt_gateway;
2756                 rt->rt_spec_dst = ort->rt_spec_dst;
2757                 rt->peer = ort->peer;
2758                 if (rt->peer)
2759                         atomic_inc(&rt->peer->refcnt);
2760
2761                 dst_free(new);
2762         }
2763
2764         dst_release(&(*rp)->u.dst);
2765         *rp = rt;
2766         return (rt ? 0 : -ENOMEM);
2767 }
2768
2769 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2770                          struct sock *sk, int flags)
2771 {
2772         int err;
2773
2774         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2775                 return err;
2776
2777         if (flp->proto) {
2778                 if (!flp->fl4_src)
2779                         flp->fl4_src = (*rp)->rt_src;
2780                 if (!flp->fl4_dst)
2781                         flp->fl4_dst = (*rp)->rt_dst;
2782                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2783                                     flags ? XFRM_LOOKUP_WAIT : 0);
2784                 if (err == -EREMOTE)
2785                         err = ipv4_dst_blackhole(net, rp, flp);
2786
2787                 return err;
2788         }
2789
2790         return 0;
2791 }
2792
2793 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2794
2795 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2796 {
2797         return ip_route_output_flow(net, rp, flp, NULL, 0);
2798 }
2799
2800 static int rt_fill_info(struct net *net,
2801                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2802                         int nowait, unsigned int flags)
2803 {
2804         struct rtable *rt = skb_rtable(skb);
2805         struct rtmsg *r;
2806         struct nlmsghdr *nlh;
2807         long expires;
2808         u32 id = 0, ts = 0, tsage = 0, error;
2809
2810         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2811         if (nlh == NULL)
2812                 return -EMSGSIZE;
2813
2814         r = nlmsg_data(nlh);
2815         r->rtm_family    = AF_INET;
2816         r->rtm_dst_len  = 32;
2817         r->rtm_src_len  = 0;
2818         r->rtm_tos      = rt->fl.fl4_tos;
2819         r->rtm_table    = RT_TABLE_MAIN;
2820         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2821         r->rtm_type     = rt->rt_type;
2822         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2823         r->rtm_protocol = RTPROT_UNSPEC;
2824         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2825         if (rt->rt_flags & RTCF_NOTIFY)
2826                 r->rtm_flags |= RTM_F_NOTIFY;
2827
2828         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2829
2830         if (rt->fl.fl4_src) {
2831                 r->rtm_src_len = 32;
2832                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2833         }
2834         if (rt->u.dst.dev)
2835                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2836 #ifdef CONFIG_NET_CLS_ROUTE
2837         if (rt->u.dst.tclassid)
2838                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2839 #endif
2840         if (rt->fl.iif)
2841                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2842         else if (rt->rt_src != rt->fl.fl4_src)
2843                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2844
2845         if (rt->rt_dst != rt->rt_gateway)
2846                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2847
2848         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2849                 goto nla_put_failure;
2850
2851         error = rt->u.dst.error;
2852         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2853         if (rt->peer) {
2854                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2855                 if (rt->peer->tcp_ts_stamp) {
2856                         ts = rt->peer->tcp_ts;
2857                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2858                 }
2859         }
2860
2861         if (rt->fl.iif) {
2862 #ifdef CONFIG_IP_MROUTE
2863                 __be32 dst = rt->rt_dst;
2864
2865                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2866                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2867                         int err = ipmr_get_route(net, skb, r, nowait);
2868                         if (err <= 0) {
2869                                 if (!nowait) {
2870                                         if (err == 0)
2871                                                 return 0;
2872                                         goto nla_put_failure;
2873                                 } else {
2874                                         if (err == -EMSGSIZE)
2875                                                 goto nla_put_failure;
2876                                         error = err;
2877                                 }
2878                         }
2879                 } else
2880 #endif
2881                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2882         }
2883
2884         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2885                                expires, error) < 0)
2886                 goto nla_put_failure;
2887
2888         return nlmsg_end(skb, nlh);
2889
2890 nla_put_failure:
2891         nlmsg_cancel(skb, nlh);
2892         return -EMSGSIZE;
2893 }
2894
2895 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2896 {
2897         struct net *net = sock_net(in_skb->sk);
2898         struct rtmsg *rtm;
2899         struct nlattr *tb[RTA_MAX+1];
2900         struct rtable *rt = NULL;
2901         __be32 dst = 0;
2902         __be32 src = 0;
2903         u32 iif;
2904         int err;
2905         struct sk_buff *skb;
2906
2907         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2908         if (err < 0)
2909                 goto errout;
2910
2911         rtm = nlmsg_data(nlh);
2912
2913         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2914         if (skb == NULL) {
2915                 err = -ENOBUFS;
2916                 goto errout;
2917         }
2918
2919         /* Reserve room for dummy headers, this skb can pass
2920            through good chunk of routing engine.
2921          */
2922         skb_reset_mac_header(skb);
2923         skb_reset_network_header(skb);
2924
2925         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2926         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2927         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2928
2929         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2930         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2931         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2932
2933         if (iif) {
2934                 struct net_device *dev;
2935
2936                 dev = __dev_get_by_index(net, iif);
2937                 if (dev == NULL) {
2938                         err = -ENODEV;
2939                         goto errout_free;
2940                 }
2941
2942                 skb->protocol   = htons(ETH_P_IP);
2943                 skb->dev        = dev;
2944                 local_bh_disable();
2945                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2946                 local_bh_enable();
2947
2948                 rt = skb_rtable(skb);
2949                 if (err == 0 && rt->u.dst.error)
2950                         err = -rt->u.dst.error;
2951         } else {
2952                 struct flowi fl = {
2953                         .nl_u = {
2954                                 .ip4_u = {
2955                                         .daddr = dst,
2956                                         .saddr = src,
2957                                         .tos = rtm->rtm_tos,
2958                                 },
2959                         },
2960                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2961                 };
2962                 err = ip_route_output_key(net, &rt, &fl);
2963         }
2964
2965         if (err)
2966                 goto errout_free;
2967
2968         skb_dst_set(skb, &rt->u.dst);
2969         if (rtm->rtm_flags & RTM_F_NOTIFY)
2970                 rt->rt_flags |= RTCF_NOTIFY;
2971
2972         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2973                            RTM_NEWROUTE, 0, 0);
2974         if (err <= 0)
2975                 goto errout_free;
2976
2977         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2978 errout:
2979         return err;
2980
2981 errout_free:
2982         kfree_skb(skb);
2983         goto errout;
2984 }
2985
2986 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2987 {
2988         struct rtable *rt;
2989         int h, s_h;
2990         int idx, s_idx;
2991         struct net *net;
2992
2993         net = sock_net(skb->sk);
2994
2995         s_h = cb->args[0];
2996         if (s_h < 0)
2997                 s_h = 0;
2998         s_idx = idx = cb->args[1];
2999         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3000                 if (!rt_hash_table[h].chain)
3001                         continue;
3002                 rcu_read_lock_bh();
3003                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3004                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3005                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3006                                 continue;
3007                         if (rt_is_expired(rt))
3008                                 continue;
3009                         skb_dst_set(skb, dst_clone(&rt->u.dst));
3010                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3011                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3012                                          1, NLM_F_MULTI) <= 0) {
3013                                 skb_dst_drop(skb);
3014                                 rcu_read_unlock_bh();
3015                                 goto done;
3016                         }
3017                         skb_dst_drop(skb);
3018                 }
3019                 rcu_read_unlock_bh();
3020         }
3021
3022 done:
3023         cb->args[0] = h;
3024         cb->args[1] = idx;
3025         return skb->len;
3026 }
3027
3028 void ip_rt_multicast_event(struct in_device *in_dev)
3029 {
3030         rt_cache_flush(dev_net(in_dev->dev), 0);
3031 }
3032
3033 #ifdef CONFIG_SYSCTL
3034 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3035                                         void __user *buffer,
3036                                         size_t *lenp, loff_t *ppos)
3037 {
3038         if (write) {
3039                 int flush_delay;
3040                 ctl_table ctl;
3041                 struct net *net;
3042
3043                 memcpy(&ctl, __ctl, sizeof(ctl));
3044                 ctl.data = &flush_delay;
3045                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3046
3047                 net = (struct net *)__ctl->extra1;
3048                 rt_cache_flush(net, flush_delay);
3049                 return 0;
3050         }
3051
3052         return -EINVAL;
3053 }
3054
3055 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3056                                                 void __user *oldval,
3057                                                 size_t __user *oldlenp,
3058                                                 void __user *newval,
3059                                                 size_t newlen)
3060 {
3061         int delay;
3062         struct net *net;
3063         if (newlen != sizeof(int))
3064                 return -EINVAL;
3065         if (get_user(delay, (int __user *)newval))
3066                 return -EFAULT;
3067         net = (struct net *)table->extra1;
3068         rt_cache_flush(net, delay);
3069         return 0;
3070 }
3071
3072 static void rt_secret_reschedule(int old)
3073 {
3074         struct net *net;
3075         int new = ip_rt_secret_interval;
3076         int diff = new - old;
3077
3078         if (!diff)
3079                 return;
3080
3081         rtnl_lock();
3082         for_each_net(net) {
3083                 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3084
3085                 if (!new)
3086                         continue;
3087
3088                 if (deleted) {
3089                         long time = net->ipv4.rt_secret_timer.expires - jiffies;
3090
3091                         if (time <= 0 || (time += diff) <= 0)
3092                                 time = 0;
3093
3094                         net->ipv4.rt_secret_timer.expires = time;
3095                 } else
3096                         net->ipv4.rt_secret_timer.expires = new;
3097
3098                 net->ipv4.rt_secret_timer.expires += jiffies;
3099                 add_timer(&net->ipv4.rt_secret_timer);
3100         }
3101         rtnl_unlock();
3102 }
3103
3104 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3105                                           void __user *buffer, size_t *lenp,
3106                                           loff_t *ppos)
3107 {
3108         int old = ip_rt_secret_interval;
3109         int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3110
3111         rt_secret_reschedule(old);
3112
3113         return ret;
3114 }
3115
3116 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3117                                                    void __user *oldval,
3118                                                    size_t __user *oldlenp,
3119                                                    void __user *newval,
3120                                                    size_t newlen)
3121 {
3122         int old = ip_rt_secret_interval;
3123         int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3124
3125         rt_secret_reschedule(old);
3126
3127         return ret;
3128 }
3129
3130 static ctl_table ipv4_route_table[] = {
3131         {
3132                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
3133                 .procname       = "gc_thresh",
3134                 .data           = &ipv4_dst_ops.gc_thresh,
3135                 .maxlen         = sizeof(int),
3136                 .mode           = 0644,
3137                 .proc_handler   = proc_dointvec,
3138         },
3139         {
3140                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
3141                 .procname       = "max_size",
3142                 .data           = &ip_rt_max_size,
3143                 .maxlen         = sizeof(int),
3144                 .mode           = 0644,
3145                 .proc_handler   = proc_dointvec,
3146         },
3147         {
3148                 /*  Deprecated. Use gc_min_interval_ms */
3149
3150                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3151                 .procname       = "gc_min_interval",
3152                 .data           = &ip_rt_gc_min_interval,
3153                 .maxlen         = sizeof(int),
3154                 .mode           = 0644,
3155                 .proc_handler   = proc_dointvec_jiffies,
3156                 .strategy       = sysctl_jiffies,
3157         },
3158         {
3159                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3160                 .procname       = "gc_min_interval_ms",
3161                 .data           = &ip_rt_gc_min_interval,
3162                 .maxlen         = sizeof(int),
3163                 .mode           = 0644,
3164                 .proc_handler   = proc_dointvec_ms_jiffies,
3165                 .strategy       = sysctl_ms_jiffies,
3166         },
3167         {
3168                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3169                 .procname       = "gc_timeout",
3170                 .data           = &ip_rt_gc_timeout,
3171                 .maxlen         = sizeof(int),
3172                 .mode           = 0644,
3173                 .proc_handler   = proc_dointvec_jiffies,
3174                 .strategy       = sysctl_jiffies,
3175         },
3176         {
3177                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3178                 .procname       = "gc_interval",
3179                 .data           = &ip_rt_gc_interval,
3180                 .maxlen         = sizeof(int),
3181                 .mode           = 0644,
3182                 .proc_handler   = proc_dointvec_jiffies,
3183                 .strategy       = sysctl_jiffies,
3184         },
3185         {
3186                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3187                 .procname       = "redirect_load",
3188                 .data           = &ip_rt_redirect_load,
3189                 .maxlen         = sizeof(int),
3190                 .mode           = 0644,
3191                 .proc_handler   = proc_dointvec,
3192         },
3193         {
3194                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3195                 .procname       = "redirect_number",
3196                 .data           = &ip_rt_redirect_number,
3197                 .maxlen         = sizeof(int),
3198                 .mode           = 0644,
3199                 .proc_handler   = proc_dointvec,
3200         },
3201         {
3202                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3203                 .procname       = "redirect_silence",
3204                 .data           = &ip_rt_redirect_silence,
3205                 .maxlen         = sizeof(int),
3206                 .mode           = 0644,
3207                 .proc_handler   = proc_dointvec,
3208         },
3209         {
3210                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3211                 .procname       = "error_cost",
3212                 .data           = &ip_rt_error_cost,
3213                 .maxlen         = sizeof(int),
3214                 .mode           = 0644,
3215                 .proc_handler   = proc_dointvec,
3216         },
3217         {
3218                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3219                 .procname       = "error_burst",
3220                 .data           = &ip_rt_error_burst,
3221                 .maxlen         = sizeof(int),
3222                 .mode           = 0644,
3223                 .proc_handler   = proc_dointvec,
3224         },
3225         {
3226                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3227                 .procname       = "gc_elasticity",
3228                 .data           = &ip_rt_gc_elasticity,
3229                 .maxlen         = sizeof(int),
3230                 .mode           = 0644,
3231                 .proc_handler   = proc_dointvec,
3232         },
3233         {
3234                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3235                 .procname       = "mtu_expires",
3236                 .data           = &ip_rt_mtu_expires,
3237                 .maxlen         = sizeof(int),
3238                 .mode           = 0644,
3239                 .proc_handler   = proc_dointvec_jiffies,
3240                 .strategy       = sysctl_jiffies,
3241         },
3242         {
3243                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3244                 .procname       = "min_pmtu",
3245                 .data           = &ip_rt_min_pmtu,
3246                 .maxlen         = sizeof(int),
3247                 .mode           = 0644,
3248                 .proc_handler   = proc_dointvec,
3249         },
3250         {
3251                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3252                 .procname       = "min_adv_mss",
3253                 .data           = &ip_rt_min_advmss,
3254                 .maxlen         = sizeof(int),
3255                 .mode           = 0644,
3256                 .proc_handler   = proc_dointvec,
3257         },
3258         {
3259                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3260                 .procname       = "secret_interval",
3261                 .data           = &ip_rt_secret_interval,
3262                 .maxlen         = sizeof(int),
3263                 .mode           = 0644,
3264                 .proc_handler   = ipv4_sysctl_rt_secret_interval,
3265                 .strategy       = ipv4_sysctl_rt_secret_interval_strategy,
3266         },
3267         { .ctl_name = 0 }
3268 };
3269
3270 static struct ctl_table empty[1];
3271
3272 static struct ctl_table ipv4_skeleton[] =
3273 {
3274         { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3275           .mode = 0555, .child = ipv4_route_table},
3276         { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3277           .mode = 0555, .child = empty},
3278         { }
3279 };
3280
3281 static __net_initdata struct ctl_path ipv4_path[] = {
3282         { .procname = "net", .ctl_name = CTL_NET, },
3283         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3284         { },
3285 };
3286
3287 static struct ctl_table ipv4_route_flush_table[] = {
3288         {
3289                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3290                 .procname       = "flush",
3291                 .maxlen         = sizeof(int),
3292                 .mode           = 0200,
3293                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3294                 .strategy       = ipv4_sysctl_rtcache_flush_strategy,
3295         },
3296         { .ctl_name = 0 },
3297 };
3298
3299 static __net_initdata struct ctl_path ipv4_route_path[] = {
3300         { .procname = "net", .ctl_name = CTL_NET, },
3301         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3302         { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3303         { },
3304 };
3305
3306 static __net_init int sysctl_route_net_init(struct net *net)
3307 {
3308         struct ctl_table *tbl;
3309
3310         tbl = ipv4_route_flush_table;
3311         if (!net_eq(net, &init_net)) {
3312                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3313                 if (tbl == NULL)
3314                         goto err_dup;
3315         }
3316         tbl[0].extra1 = net;
3317
3318         net->ipv4.route_hdr =
3319                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3320         if (net->ipv4.route_hdr == NULL)
3321                 goto err_reg;
3322         return 0;
3323
3324 err_reg:
3325         if (tbl != ipv4_route_flush_table)
3326                 kfree(tbl);
3327 err_dup:
3328         return -ENOMEM;
3329 }
3330
3331 static __net_exit void sysctl_route_net_exit(struct net *net)
3332 {
3333         struct ctl_table *tbl;
3334
3335         tbl = net->ipv4.route_hdr->ctl_table_arg;
3336         unregister_net_sysctl_table(net->ipv4.route_hdr);
3337         BUG_ON(tbl == ipv4_route_flush_table);
3338         kfree(tbl);
3339 }
3340
3341 static __net_initdata struct pernet_operations sysctl_route_ops = {
3342         .init = sysctl_route_net_init,
3343         .exit = sysctl_route_net_exit,
3344 };
3345 #endif
3346
3347
3348 static __net_init int rt_secret_timer_init(struct net *net)
3349 {
3350         atomic_set(&net->ipv4.rt_genid,
3351                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3352                         (jiffies ^ (jiffies >> 7))));
3353
3354         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3355         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3356         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3357
3358         if (ip_rt_secret_interval) {
3359                 net->ipv4.rt_secret_timer.expires =
3360                         jiffies + net_random() % ip_rt_secret_interval +
3361                         ip_rt_secret_interval;
3362                 add_timer(&net->ipv4.rt_secret_timer);
3363         }
3364         return 0;
3365 }
3366
3367 static __net_exit void rt_secret_timer_exit(struct net *net)
3368 {
3369         del_timer_sync(&net->ipv4.rt_secret_timer);
3370 }
3371
3372 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3373         .init = rt_secret_timer_init,
3374         .exit = rt_secret_timer_exit,
3375 };
3376
3377
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Accounting area; ip_rt_init() allocates it per-CPU with room for
 * 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3381
3382 static __initdata unsigned long rhash_entries;
3383 static int __init set_rhash_entries(char *str)
3384 {
3385         if (!str)
3386                 return 0;
3387         rhash_entries = simple_strtoul(str, &str, 0);
3388         return 1;
3389 }
3390 __setup("rhash_entries=", set_rhash_entries);
3391
3392 int __init ip_rt_init(void)
3393 {
3394         int rc = 0;
3395
3396 #ifdef CONFIG_NET_CLS_ROUTE
3397         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3398         if (!ip_rt_acct)
3399                 panic("IP: failed to allocate ip_rt_acct\n");
3400 #endif
3401
3402         ipv4_dst_ops.kmem_cachep =
3403                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3404                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3405
3406         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3407
3408         rt_hash_table = (struct rt_hash_bucket *)
3409                 alloc_large_system_hash("IP route cache",
3410                                         sizeof(struct rt_hash_bucket),
3411                                         rhash_entries,
3412                                         (totalram_pages >= 128 * 1024) ?
3413                                         15 : 17,
3414                                         0,
3415                                         &rt_hash_log,
3416                                         &rt_hash_mask,
3417                                         rhash_entries ? 0 : 512 * 1024);
3418         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3419         rt_hash_lock_init();
3420
3421         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3422         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3423
3424         devinet_init();
3425         ip_fib_init();
3426
3427         /* All the timers, started at system startup tend
3428            to synchronize. Perturb it a bit.
3429          */
3430         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3431         expires_ljiffies = jiffies;
3432         schedule_delayed_work(&expires_work,
3433                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3434
3435         if (register_pernet_subsys(&rt_secret_timer_ops))
3436                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3437
3438         if (ip_rt_proc_init())
3439                 printk(KERN_ERR "Unable to create route proc files\n");
3440 #ifdef CONFIG_XFRM
3441         xfrm_init();
3442         xfrm4_init(ip_rt_max_size);
3443 #endif
3444         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3445
3446 #ifdef CONFIG_SYSCTL
3447         register_pernet_subsys(&sysctl_route_ops);
3448 #endif
3449         return rc;
3450 }
3451
#ifdef CONFIG_SYSCTL
/*
 * Register the static net/ipv4 directory skeleton (ipv4_skeleton under
 * ipv4_path).  The returned header is not kept.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	(void)register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif
3462
3463 EXPORT_SYMBOL(__ip_select_ident);
3464 EXPORT_SYMBOL(ip_route_input);
3465 EXPORT_SYMBOL(ip_route_output_key);