net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <[email protected]>
  10  *              Alan Cox, <[email protected]>
  11  *              Linus Torvalds, <[email protected]>
  12  *              Alexey Kuznetsov, <[email protected]>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              ([email protected])      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111 #include <net/atmclip.h>
 112 #include <net/secure_seq.h>
 113
 114 #define RT_FL_TOS(oldflp4) \
 115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 116
 117 #define IP_MAX_MTU      0xFFF0
 118
 119 #define RT_GC_TIMEOUT (300*HZ)
 120
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_gc_elasticity __read_mostly    = 8;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133 static int rt_chain_length_max __read_mostly    = 20;
 134
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 144 static void              ipv4_link_failure(struct sk_buff *skb);
 145 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 146 static int rt_garbage_collect(struct dst_ops *ops);
 147
 148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 149                             int how)
 150 {
 151 }
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         struct rtable *rt = (struct rtable *) dst;
 156         struct inet_peer *peer;
 157         u32 *p = NULL;
 158
 159         if (!rt->peer)
 160                 rt_bind_peer(rt, rt->rt_dst, 1);
 161
 162         peer = rt->peer;
 163         if (peer) {
 164                 u32 *old_p = __DST_METRICS_PTR(old);
 165                 unsigned long prev, new;
 166
 167                 p = peer->metrics;
 168                 if (inet_metrics_new(peer))
 169                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 170
 171                 new = (unsigned long) p;
 172                 prev = cmpxchg(&dst->_metrics, old, new);
 173
 174                 if (prev != old) {
 175                         p = __DST_METRICS_PTR(prev);
 176                         if (prev & DST_METRICS_READ_ONLY)
 177                                 p = NULL;
 178                 } else {
 179                         if (rt->fi) {
 180                                 fib_info_put(rt->fi);
 181                                 rt->fi = NULL;
 182                         }
 183                 }
 184         }
 185         return p;
 186 }
 187
 188 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 189
 190 static struct dst_ops ipv4_dst_ops = {
 191         .family =               AF_INET,
 192         .protocol =             cpu_to_be16(ETH_P_IP),
 193         .gc =                   rt_garbage_collect,
 194         .check =                ipv4_dst_check,
 195         .default_advmss =       ipv4_default_advmss,
 196         .default_mtu =          ipv4_default_mtu,
 197         .cow_metrics =          ipv4_cow_metrics,
 198         .destroy =              ipv4_dst_destroy,
 199         .ifdown =               ipv4_dst_ifdown,
 200         .negative_advice =      ipv4_negative_advice,
 201         .link_failure =         ipv4_link_failure,
 202         .update_pmtu =          ip_rt_update_pmtu,
 203         .local_out =            __ip_local_out,
 204         .neigh_lookup =         ipv4_neigh_lookup,
 205 };
 206
 207 #define ECN_OR_COST(class)      TC_PRIO_##class
 208
 209 const __u8 ip_tos2prio[16] = {
 210         TC_PRIO_BESTEFFORT,
 211         ECN_OR_COST(BESTEFFORT),
 212         TC_PRIO_BESTEFFORT,
 213         ECN_OR_COST(BESTEFFORT),
 214         TC_PRIO_BULK,
 215         ECN_OR_COST(BULK),
 216         TC_PRIO_BULK,
 217         ECN_OR_COST(BULK),
 218         TC_PRIO_INTERACTIVE,
 219         ECN_OR_COST(INTERACTIVE),
 220         TC_PRIO_INTERACTIVE,
 221         ECN_OR_COST(INTERACTIVE),
 222         TC_PRIO_INTERACTIVE_BULK,
 223         ECN_OR_COST(INTERACTIVE_BULK),
 224         TC_PRIO_INTERACTIVE_BULK,
 225         ECN_OR_COST(INTERACTIVE_BULK)
 226 };
 227
 228
 229 /*
 230  * Route cache.
 231  */
 232
 233 /* The locking scheme is rather straight forward:
 234  *
 235  * 1) Read-Copy Update protects the buckets of the central route hash.
 236  * 2) Only writers remove entries, and they hold the lock
 237  *    as they look at rtable reference counts.
 238  * 3) Only readers acquire references to rtable entries,
 239  *    they do so with atomic increments and with the
 240  *    lock held.
 241  */
 242
 243 struct rt_hash_bucket {
 244         struct rtable __rcu     *chain;
 245 };
 246
 247 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 248         defined(CONFIG_PROVE_LOCKING)
 249 /*
 250  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 251  * The size of this table is a power of two and depends on the number of CPUS.
 252  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 253  */
 254 #ifdef CONFIG_LOCKDEP
 255 # define RT_HASH_LOCK_SZ        256
 256 #else
 257 # if NR_CPUS >= 32
 258 #  define RT_HASH_LOCK_SZ       4096
 259 # elif NR_CPUS >= 16
 260 #  define RT_HASH_LOCK_SZ       2048
 261 # elif NR_CPUS >= 8
 262 #  define RT_HASH_LOCK_SZ       1024
 263 # elif NR_CPUS >= 4
 264 #  define RT_HASH_LOCK_SZ       512
 265 # else
 266 #  define RT_HASH_LOCK_SZ       256
 267 # endif
 268 #endif
 269
 270 static spinlock_t       *rt_hash_locks;
 271 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 272
 273 static __init void rt_hash_lock_init(void)
 274 {
 275         int i;
 276
 277         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 278                         GFP_KERNEL);
 279         if (!rt_hash_locks)
 280                 panic("IP: failed to allocate rt_hash_locks\n");
 281
 282         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 283                 spin_lock_init(&rt_hash_locks[i]);
 284 }
 285 #else
 286 # define rt_hash_lock_addr(slot) NULL
 287
 288 static inline void rt_hash_lock_init(void)
 289 {
 290 }
 291 #endif
 292
 293 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 294 static unsigned                 rt_hash_mask __read_mostly;
 295 static unsigned int             rt_hash_log  __read_mostly;
 296
 297 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 298 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 299
 300 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 301                                    int genid)
 302 {
 303         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 304                             idx, genid)
 305                 & rt_hash_mask;
 306 }
 307
 308 static inline int rt_genid(struct net *net)
 309 {
 310         return atomic_read(&net->ipv4.rt_genid);
 311 }
 312
 313 #ifdef CONFIG_PROC_FS
 314 struct rt_cache_iter_state {
 315         struct seq_net_private p;
 316         int bucket;
 317         int genid;
 318 };
 319
 320 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 321 {
 322         struct rt_cache_iter_state *st = seq->private;
 323         struct rtable *r = NULL;
 324
 325         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 326                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 327                         continue;
 328                 rcu_read_lock_bh();
 329                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 330                 while (r) {
 331                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 332                             r->rt_genid == st->genid)
 333                                 return r;
 334                         r = rcu_dereference_bh(r->dst.rt_next);
 335                 }
 336                 rcu_read_unlock_bh();
 337         }
 338         return r;
 339 }
 340
 341 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 342                                           struct rtable *r)
 343 {
 344         struct rt_cache_iter_state *st = seq->private;
 345
 346         r = rcu_dereference_bh(r->dst.rt_next);
 347         while (!r) {
 348                 rcu_read_unlock_bh();
 349                 do {
 350                         if (--st->bucket < 0)
 351                                 return NULL;
 352                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 353                 rcu_read_lock_bh();
 354                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 355         }
 356         return r;
 357 }
 358
 359 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 360                                         struct rtable *r)
 361 {
 362         struct rt_cache_iter_state *st = seq->private;
 363         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 364                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 365                         continue;
 366                 if (r->rt_genid == st->genid)
 367                         break;
 368         }
 369         return r;
 370 }
 371
 372 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 373 {
 374         struct rtable *r = rt_cache_get_first(seq);
 375
 376         if (r)
 377                 while (pos && (r = rt_cache_get_next(seq, r)))
 378                         --pos;
 379         return pos ? NULL : r;
 380 }
 381
 382 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 383 {
 384         struct rt_cache_iter_state *st = seq->private;
 385         if (*pos)
 386                 return rt_cache_get_idx(seq, *pos - 1);
 387         st->genid = rt_genid(seq_file_net(seq));
 388         return SEQ_START_TOKEN;
 389 }
 390
 391 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 392 {
 393         struct rtable *r;
 394
 395         if (v == SEQ_START_TOKEN)
 396                 r = rt_cache_get_first(seq);
 397         else
 398                 r = rt_cache_get_next(seq, v);
 399         ++*pos;
 400         return r;
 401 }
 402
 403 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 404 {
 405         if (v && v != SEQ_START_TOKEN)
 406                 rcu_read_unlock_bh();
 407 }
 408
 409 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 410 {
 411         if (v == SEQ_START_TOKEN)
 412                 seq_printf(seq, "%-127s\n",
 413                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 414                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 415                            "HHUptod\tSpecDst");
 416         else {
 417                 struct rtable *r = v;
 418                 struct neighbour *n;
 419                 int len;
 420
 421                 n = dst_get_neighbour(&r->dst);
 422                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 423                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 424                         r->dst.dev ? r->dst.dev->name : "*",
 425                         (__force u32)r->rt_dst,
 426                         (__force u32)r->rt_gateway,
 427                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 428                         r->dst.__use, 0, (__force u32)r->rt_src,
 429                         dst_metric_advmss(&r->dst) + 40,
 430                         dst_metric(&r->dst, RTAX_WINDOW),
 431                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 432                               dst_metric(&r->dst, RTAX_RTTVAR)),
 433                         r->rt_key_tos,
 434                         -1,
 435                         (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
 436                         r->rt_spec_dst, &len);
 437
 438                 seq_printf(seq, "%*s\n", 127 - len, "");
 439         }
 440         return 0;
 441 }
 442
 443 static const struct seq_operations rt_cache_seq_ops = {
 444         .start  = rt_cache_seq_start,
 445         .next   = rt_cache_seq_next,
 446         .stop   = rt_cache_seq_stop,
 447         .show   = rt_cache_seq_show,
 448 };
 449
 450 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 451 {
 452         return seq_open_net(inode, file, &rt_cache_seq_ops,
 453                         sizeof(struct rt_cache_iter_state));
 454 }
 455
 456 static const struct file_operations rt_cache_seq_fops = {
 457         .owner   = THIS_MODULE,
 458         .open    = rt_cache_seq_open,
 459         .read    = seq_read,
 460         .llseek  = seq_lseek,
 461         .release = seq_release_net,
 462 };
 463
 464
 465 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 466 {
 467         int cpu;
 468
 469         if (*pos == 0)
 470                 return SEQ_START_TOKEN;
 471
 472         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 473                 if (!cpu_possible(cpu))
 474                         continue;
 475                 *pos = cpu+1;
 476                 return &per_cpu(rt_cache_stat, cpu);
 477         }
 478         return NULL;
 479 }
 480
 481 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 482 {
 483         int cpu;
 484
 485         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 486                 if (!cpu_possible(cpu))
 487                         continue;
 488                 *pos = cpu+1;
 489                 return &per_cpu(rt_cache_stat, cpu);
 490         }
 491         return NULL;
 492
 493 }
 494
 495 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 496 {
 497
 498 }
 499
 500 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 501 {
 502         struct rt_cache_stat *st = v;
 503
 504         if (v == SEQ_START_TOKEN) {
 505                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 506                 return 0;
 507         }
 508
 509         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 510                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 511                    dst_entries_get_slow(&ipv4_dst_ops),
 512                    st->in_hit,
 513                    st->in_slow_tot,
 514                    st->in_slow_mc,
 515                    st->in_no_route,
 516                    st->in_brd,
 517                    st->in_martian_dst,
 518                    st->in_martian_src,
 519
 520                    st->out_hit,
 521                    st->out_slow_tot,
 522                    st->out_slow_mc,
 523
 524                    st->gc_total,
 525                    st->gc_ignored,
 526                    st->gc_goal_miss,
 527                    st->gc_dst_overflow,
 528                    st->in_hlist_search,
 529                    st->out_hlist_search
 530                 );
 531         return 0;
 532 }
 533
 534 static const struct seq_operations rt_cpu_seq_ops = {
 535         .start  = rt_cpu_seq_start,
 536         .next   = rt_cpu_seq_next,
 537         .stop   = rt_cpu_seq_stop,
 538         .show   = rt_cpu_seq_show,
 539 };
 540
 541
 542 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 543 {
 544         return seq_open(file, &rt_cpu_seq_ops);
 545 }
 546
 547 static const struct file_operations rt_cpu_seq_fops = {
 548         .owner   = THIS_MODULE,
 549         .open    = rt_cpu_seq_open,
 550         .read    = seq_read,
 551         .llseek  = seq_lseek,
 552         .release = seq_release,
 553 };
 554
 555 #ifdef CONFIG_IP_ROUTE_CLASSID
 556 static int rt_acct_proc_show(struct seq_file *m, void *v)
 557 {
 558         struct ip_rt_acct *dst, *src;
 559         unsigned int i, j;
 560
 561         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 562         if (!dst)
 563                 return -ENOMEM;
 564
 565         for_each_possible_cpu(i) {
 566                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 567                 for (j = 0; j < 256; j++) {
 568                         dst[j].o_bytes   += src[j].o_bytes;
 569                         dst[j].o_packets += src[j].o_packets;
 570                         dst[j].i_bytes   += src[j].i_bytes;
 571                         dst[j].i_packets += src[j].i_packets;
 572                 }
 573         }
 574
 575         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 576         kfree(dst);
 577         return 0;
 578 }
 579
 580 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 581 {
 582         return single_open(file, rt_acct_proc_show, NULL);
 583 }
 584
 585 static const struct file_operations rt_acct_proc_fops = {
 586         .owner          = THIS_MODULE,
 587         .open           = rt_acct_proc_open,
 588         .read           = seq_read,
 589         .llseek         = seq_lseek,
 590         .release        = single_release,
 591 };
 592 #endif
 593
 594 static int __net_init ip_rt_do_proc_init(struct net *net)
 595 {
 596         struct proc_dir_entry *pde;
 597
 598         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 599                         &rt_cache_seq_fops);
 600         if (!pde)
 601                 goto err1;
 602
 603         pde = proc_create("rt_cache", S_IRUGO,
 604                           net->proc_net_stat, &rt_cpu_seq_fops);
 605         if (!pde)
 606                 goto err2;
 607
 608 #ifdef CONFIG_IP_ROUTE_CLASSID
 609         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 610         if (!pde)
 611                 goto err3;
 612 #endif
 613         return 0;
 614
 615 #ifdef CONFIG_IP_ROUTE_CLASSID
 616 err3:
 617         remove_proc_entry("rt_cache", net->proc_net_stat);
 618 #endif
 619 err2:
 620         remove_proc_entry("rt_cache", net->proc_net);
 621 err1:
 622         return -ENOMEM;
 623 }
 624
 625 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 626 {
 627         remove_proc_entry("rt_cache", net->proc_net_stat);
 628         remove_proc_entry("rt_cache", net->proc_net);
 629 #ifdef CONFIG_IP_ROUTE_CLASSID
 630         remove_proc_entry("rt_acct", net->proc_net);
 631 #endif
 632 }
 633
 634 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 635         .init = ip_rt_do_proc_init,
 636         .exit = ip_rt_do_proc_exit,
 637 };
 638
 639 static int __init ip_rt_proc_init(void)
 640 {
 641         return register_pernet_subsys(&ip_rt_proc_ops);
 642 }
 643
 644 #else
 645 static inline int ip_rt_proc_init(void)
 646 {
 647         return 0;
 648 }
 649 #endif /* CONFIG_PROC_FS */
 650
 651 static inline void rt_free(struct rtable *rt)
 652 {
 653         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 654 }
 655
 656 static inline void rt_drop(struct rtable *rt)
 657 {
 658         ip_rt_put(rt);
 659         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 660 }
 661
 662 static inline int rt_fast_clean(struct rtable *rth)
 663 {
 664         /* Kill broadcast/multicast entries very aggresively, if they
 665            collide in hash table with more useful entries */
 666         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 667                 rt_is_input_route(rth) && rth->dst.rt_next;
 668 }
 669
 670 static inline int rt_valuable(struct rtable *rth)
 671 {
 672         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 673                 (rth->peer && rth->peer->pmtu_expires);
 674 }
 675
 676 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 677 {
 678         unsigned long age;
 679         int ret = 0;
 680
 681         if (atomic_read(&rth->dst.__refcnt))
 682                 goto out;
 683
 684         age = jiffies - rth->dst.lastuse;
 685         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 686             (age <= tmo2 && rt_valuable(rth)))
 687                 goto out;
 688         ret = 1;
 689 out:    return ret;
 690 }
 691
 692 /* Bits of score are:
 693  * 31: very valuable
 694  * 30: not quite useless
 695  * 29..0: usage counter
 696  */
 697 static inline u32 rt_score(struct rtable *rt)
 698 {
 699         u32 score = jiffies - rt->dst.lastuse;
 700
 701         score = ~score & ~(3<<30);
 702
 703         if (rt_valuable(rt))
 704                 score |= (1<<31);
 705
 706         if (rt_is_output_route(rt) ||
 707             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 708                 score |= (1<<30);
 709
 710         return score;
 711 }
 712
 713 static inline bool rt_caching(const struct net *net)
 714 {
 715         return net->ipv4.current_rt_cache_rebuild_count <=
 716                 net->ipv4.sysctl_rt_cache_rebuild_count;
 717 }
 718
 719 static inline bool compare_hash_inputs(const struct rtable *rt1,
 720                                        const struct rtable *rt2)
 721 {
 722         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 723                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 724                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 725 }
 726
 727 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 728 {
 729         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 731                 (rt1->rt_mark ^ rt2->rt_mark) |
 732                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 733                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 734                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 735 }
 736
 737 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 738 {
 739         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 740 }
 741
 742 static inline int rt_is_expired(struct rtable *rth)
 743 {
 744         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 745 }
 746
 747 /*
 748  * Perform a full scan of hash table and free all entries.
 749  * Can be called by a softirq or a process.
 750  * In the later case, we want to be reschedule if necessary
 751  */
 752 static void rt_do_flush(struct net *net, int process_context)
 753 {
 754         unsigned int i;
 755         struct rtable *rth, *next;
 756
 757         for (i = 0; i <= rt_hash_mask; i++) {
 758                 struct rtable __rcu **pprev;
 759                 struct rtable *list;
 760
 761                 if (process_context && need_resched())
 762                         cond_resched();
 763                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 764                 if (!rth)
 765                         continue;
 766
 767                 spin_lock_bh(rt_hash_lock_addr(i));
 768
 769                 list = NULL;
 770                 pprev = &rt_hash_table[i].chain;
 771                 rth = rcu_dereference_protected(*pprev,
 772                         lockdep_is_held(rt_hash_lock_addr(i)));
 773
 774                 while (rth) {
 775                         next = rcu_dereference_protected(rth->dst.rt_next,
 776                                 lockdep_is_held(rt_hash_lock_addr(i)));
 777
 778                         if (!net ||
 779                             net_eq(dev_net(rth->dst.dev), net)) {
 780                                 rcu_assign_pointer(*pprev, next);
 781                                 rcu_assign_pointer(rth->dst.rt_next, list);
 782                                 list = rth;
 783                         } else {
 784                                 pprev = &rth->dst.rt_next;
 785                         }
 786                         rth = next;
 787                 }
 788
 789                 spin_unlock_bh(rt_hash_lock_addr(i));
 790
 791                 for (; list; list = next) {
 792                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 793                         rt_free(list);
 794                 }
 795         }
 796 }
 797
 798 /*
 799  * While freeing expired entries, we compute average chain length
 800  * and standard deviation, using fixed-point arithmetic.
 801  * This to have an estimation of rt_chain_length_max
 802  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 803  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 804  */
 805
 806 #define FRACT_BITS 3
 807 #define ONE (1UL << FRACT_BITS)
 808
 809 /*
 810  * Given a hash chain and an item in this hash chain,
 811  * find if a previous entry has the same hash_inputs
 812  * (but differs on tos, mark or oif)
 813  * Returns 0 if an alias is found.
 814  * Returns ONE if rth has no alias before itself.
 815  */
 816 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 817 {
 818         const struct rtable *aux = head;
 819
 820         while (aux != rth) {
 821                 if (compare_hash_inputs(aux, rth))
 822                         return 0;
 823                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 824         }
 825         return ONE;
 826 }
 827
 828 /*
 829  * Perturbation of rt_genid by a small quantity [1..256]
 830  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 831  * many times (2^24) without giving recent rt_genid.
 832  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 833  */
 834 static void rt_cache_invalidate(struct net *net)
 835 {
 836         unsigned char shuffle;
 837
 838         get_random_bytes(&shuffle, sizeof(shuffle));
 839         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 840 }
 841
 842 /*
 843  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 844  * delay >= 0 : invalidate & flush cache (can be long)
 845  */
 846 void rt_cache_flush(struct net *net, int delay)
 847 {
 848         rt_cache_invalidate(net);
 849         if (delay >= 0)
 850                 rt_do_flush(net, !in_softirq());
 851 }
 852
 853 /* Flush previous cache invalidated entries from the cache */
 854 void rt_cache_flush_batch(struct net *net)
 855 {
 856         rt_do_flush(net, !in_softirq());
 857 }
 858
 859 static void rt_emergency_hash_rebuild(struct net *net)
 860 {
 861         if (net_ratelimit())
 862                 printk(KERN_WARNING "Route hash chain too long!\n");
 863         rt_cache_invalidate(net);
 864 }
 865
 866 /*
 867    Short description of GC goals.
 868
 869    We want to build an algorithm which will keep the routing cache
 870    at some equilibrium point, where the number of aged-off entries
 871    is kept approximately equal to the number of newly generated ones.
 872
 873    Current expiration strength is the variable "expire".
 874    We try to adjust it dynamically, so that if networking
 875    is idle, expire is large enough to keep enough warm entries,
 876    and when load increases it is reduced to limit the cache size.
 877  */
 878
 /*
  * rt_garbage_collect - expire route cache entries until a computed goal
  * is reached.
  *
  * @ops is not referenced in the body; all accounting goes through
  * ipv4_dst_ops directly.
  *
  * The static locals (expire, last_gc, rover, equilibrium) carry state
  * from one invocation to the next.
  *
  * Returns 0 normally (including when the run is skipped as too frequent),
  * or 1 when the entry count is still not below ip_rt_max_size after
  * collecting, which is logged as "dst cache overflow".
  */
 879 static int rt_garbage_collect(struct dst_ops *ops)
 880 {
 881         static unsigned long expire = RT_GC_TIMEOUT;
 882         static unsigned long last_gc;
 883         static int rover;
 884         static int equilibrium;
 885         struct rtable *rth;
 886         struct rtable __rcu **rthp;
 887         unsigned long now = jiffies;
 888         int goal;
 889         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 890
 891         /*
 892          * Garbage collection is pretty expensive,
 893          * do not make it too frequently.
 894          */
 895
 896         RT_CACHE_STAT_INC(gc_total);
 897
             /* Skip only if we ran recently AND the cache is below its hard limit. */
 898         if (now - last_gc < ip_rt_gc_min_interval &&
 899             entries < ip_rt_max_size) {
 900                 RT_CACHE_STAT_INC(gc_ignored);
 901                 goto out;
 902         }
 903
 904         entries = dst_entries_get_slow(&ipv4_dst_ops);
 905         /* Calculate number of entries, which we want to expire now. */
 906         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 907         if (goal <= 0) {
                     /* Below the elasticity limit: aim for the equilibrium size instead. */
 908                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 909                         equilibrium = ipv4_dst_ops.gc_thresh;
 910                 goal = entries - equilibrium;
 911                 if (goal > 0) {
 912                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 913                         goal = entries - equilibrium;
 914                 }
 915         } else {
 916                 /* We are in dangerous area. Try to reduce cache really
 917                  * aggressively.
 918                  */
 919                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 920                 equilibrium = entries - goal;
 921         }
 922
 923         if (now - last_gc >= ip_rt_gc_min_interval)
 924                 last_gc = now;
 925
 926         if (goal <= 0) {
 927                 equilibrium += goal;
 928                 goto work_done;
 929         }
 930
 931         do {
 932                 int i, k;
 933
                     /* Visit each bucket once, starting just after where the last run stopped. */
 934                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 935                         unsigned long tmo = expire;
 936
 937                         k = (k + 1) & rt_hash_mask;
 938                         rthp = &rt_hash_table[k].chain;
 939                         spin_lock_bh(rt_hash_lock_addr(k));
 940                         while ((rth = rcu_dereference_protected(*rthp,
 941                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 942                                 if (!rt_is_expired(rth) &&
 943                                         !rt_may_expire(rth, tmo, expire)) {
                                             /* Entry kept: halve the timeout used for the rest of this chain. */
 944                                         tmo >>= 1;
 945                                         rthp = &rth->dst.rt_next;
 946                                         continue;
 947                                 }
                                     /* Unlink the entry from the chain, then free it. */
 948                                 *rthp = rth->dst.rt_next;
 949                                 rt_free(rth);
 950                                 goal--;
 951                         }
 952                         spin_unlock_bh(rt_hash_lock_addr(k));
 953                         if (goal <= 0)
 954                                 break;
 955                 }
 956                 rover = k;
 957
 958                 if (goal <= 0)
 959                         goto work_done;
 960
 961                 /* Goal is not achieved. We stop process if:
 962
 963                    - if expire reduced to zero. Otherwise, expire is halved.
 964                    - if table is not full.
 965                    - if we are called from interrupt.
 966                    - jiffies check is just fallback/debug loop breaker.
 967                      We will not spin here for long time in any case.
 968                  */
 969
 970                 RT_CACHE_STAT_INC(gc_goal_miss);
 971
 972                 if (expire == 0)
 973                         break;
 974
 975                 expire >>= 1;
 976
 977                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 978                         goto out;
 979         } while (!in_softirq() && time_before_eq(jiffies, now));
 980
             /* Re-check with the fast counter first, then the slow one, before reporting overflow. */
 981         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 982                 goto out;
 983         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 984                 goto out;
 985         if (net_ratelimit())
 986                 printk(KERN_WARNING "dst cache overflow\n");
 987         RT_CACHE_STAT_INC(gc_dst_overflow);
 988         return 1;
 989
 990 work_done:
             /* Goal met: relax expire again, resetting it to ip_rt_gc_timeout when it overshoots or the cache is below gc_thresh. */
 991         expire += ip_rt_gc_min_interval;
 992         if (expire > ip_rt_gc_timeout ||
 993             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
 994             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 995                 expire = ip_rt_gc_timeout;
 996 out:    return 0;
 997 }
 998
 999 /*
1000  * Returns number of entries in a hash chain that have different hash_inputs
1001  */
1002 static int slow_chain_length(const struct rtable *head)
1003 {
1004         int length = 0;
1005         const struct rtable *rth = head;
1006
1007         while (rth) {
1008                 length += has_noalias(head, rth);
1009                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1010         }
1011         return length >> FRACT_BITS;
1012 }
1013
1014 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1015 {
1016         struct neigh_table *tbl = &arp_tbl;
1017         static const __be32 inaddr_any = 0;
1018         struct net_device *dev = dst->dev;
1019         const __be32 *pkey = daddr;
1020         struct neighbour *n;
1021
1022 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1023         if (dev->type == ARPHRD_ATM)
1024                 tbl = clip_tbl_hook;
1025 #endif
1026         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1027                 pkey = &inaddr_any;
1028
1029         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1030         if (n)
1031                 return n;
1032         return neigh_create(tbl, pkey, dev);
1033 }
1034
1035 static int rt_bind_neighbour(struct rtable *rt)
1036 {
1037         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1038         if (IS_ERR(n))
1039                 return PTR_ERR(n);
1040         dst_set_neighbour(&rt->dst, n);
1041
1042         return 0;
1043 }
1044
/*
 * rt_intern_hash - insert @rt into bucket @hash of the route cache.
 *
 * @hash:    bucket index; recomputed here if an emergency rebuild occurs.
 * @rt:      the new route. It is dropped (rt_drop/ip_rt_put) on every path
 *           that does not return it.
 * @skb:     if non-NULL, the returned route is attached with skb_dst_set().
 * @ifindex: used only to recompute the hash after an emergency rebuild.
 *
 * When caching is disabled for the route's namespace, @rt is flagged
 * DST_NOCACHE, bound to a neighbour if it is unicast or an output route,
 * and returned without being hashed.
 *
 * Otherwise the chain is walked under the bucket lock. Expired entries are
 * freed. If an entry with the same keys and namespace exists, it is moved
 * to the head of the chain and returned instead of @rt. Failing that, @rt
 * is bound to a neighbour (where applicable) and published at the head.
 *
 * Returns the route to use, or an ERR_PTR() if neighbour binding failed.
 */
1045 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1046                                      struct sk_buff *skb, int ifindex)
1047 {
1048         struct rtable   *rth, *cand;
1049         struct rtable __rcu **rthp, **candp;
1050         unsigned long   now;
1051         u32             min_score;
1052         int             chain_length;
             /* One garbage-collect retry is allowed, and only outside softirq. */
1053         int attempts = !in_softirq();
1054
1055 restart:
1056         chain_length = 0;
1057         min_score = ~(u32)0;
1058         cand = NULL;
1059         candp = NULL;
1060         now = jiffies;
1061
1062         if (!rt_caching(dev_net(rt->dst.dev))) {
1063                 /*
1064                  * If we're not caching, just tell the caller we
1065                  * were successful and don't touch the route.  The
1066                  * caller hold the sole reference to the cache entry, and
1067                  * it will be released when the caller is done with it.
1068                  * If we drop it here, the callers have no way to resolve routes
1069                  * when we're not caching.  Instead, just point *rp at rt, so
1070                  * the caller gets a single use out of the route
1071                  * Note that we do rt_free on this new route entry, so that
1072                  * once its refcount hits zero, we are still able to reap it
1073                  * (Thanks Alexey)
1074                  * Note: To avoid expensive rcu stuff for this uncached dst,
1075                  * we set DST_NOCACHE so that dst_release() can free dst without
1076                  * waiting a grace period.
1077                  */
1078
1079                 rt->dst.flags |= DST_NOCACHE;
1080                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1081                         int err = rt_bind_neighbour(rt);
1082                         if (err) {
1083                                 if (net_ratelimit())
1084                                         printk(KERN_WARNING
1085                                             "Neighbour table failure & not caching routes.\n");
1086                                 ip_rt_put(rt);
1087                                 return ERR_PTR(err);
1088                         }
1089                 }
1090
1091                 goto skip_hashing;
1092         }
1093
1094         rthp = &rt_hash_table[hash].chain;
1095
1096         spin_lock_bh(rt_hash_lock_addr(hash));
1097         while ((rth = rcu_dereference_protected(*rthp,
1098                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
                     /* Reap expired entries as we go; they do not count towards chain_length. */
1099                 if (rt_is_expired(rth)) {
1100                         *rthp = rth->dst.rt_next;
1101                         rt_free(rth);
1102                         continue;
1103                 }
1104                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1105                         /* Put it first */
1106                         *rthp = rth->dst.rt_next;
1107                         /*
1108                          * Since lookup is lockfree, the deletion
1109                          * must be visible to another weakly ordered CPU before
1110                          * the insertion at the start of the hash chain.
1111                          */
1112                         rcu_assign_pointer(rth->dst.rt_next,
1113                                            rt_hash_table[hash].chain);
1114                         /*
1115                          * Since lookup is lockfree, the update writes
1116                          * must be ordered for consistency on SMP.
1117                          */
1118                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1119
1120                         dst_use(&rth->dst, now);
1121                         spin_unlock_bh(rt_hash_lock_addr(hash));
1122
                             /* The existing entry wins; the caller's new route is discarded. */
1123                         rt_drop(rt);
1124                         if (skb)
1125                                 skb_dst_set(skb, &rth->dst);
1126                         return rth;
1127                 }
1128
                     /* Track the unreferenced entry with the lowest score as eviction candidate. */
1129                 if (!atomic_read(&rth->dst.__refcnt)) {
1130                         u32 score = rt_score(rth);
1131
1132                         if (score <= min_score) {
1133                                 cand = rth;
1134                                 candp = rthp;
1135                                 min_score = score;
1136                         }
1137                 }
1138
1139                 chain_length++;
1140
1141                 rthp = &rth->dst.rt_next;
1142         }
1143
1144         if (cand) {
1145                 /* ip_rt_gc_elasticity used to be average length of chain
1146                  * length, when exceeded gc becomes really aggressive.
1147                  *
1148                  * The second limit is less certain. At the moment it allows
1149                  * only 2 entries per bucket. We will see.
1150                  */
1151                 if (chain_length > ip_rt_gc_elasticity) {
1152                         *candp = cand->dst.rt_next;
1153                         rt_free(cand);
1154                 }
1155         } else {
                     /*
                      * Nothing evictable. If the chain is too long even by the
                      * alias-aware count, invalidate the cache, recompute the
                      * hash with the new generation id and start over.
                      */
1156                 if (chain_length > rt_chain_length_max &&
1157                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1158                         struct net *net = dev_net(rt->dst.dev);
1159                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1160                         if (!rt_caching(net)) {
1161                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1162                                         rt->dst.dev->name, num);
1163                         }
1164                         rt_emergency_hash_rebuild(net);
1165                         spin_unlock_bh(rt_hash_lock_addr(hash));
1166
1167                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1168                                         ifindex, rt_genid(net));
1169                         goto restart;
1170                 }
1171         }
1172
1173         /* Try to bind route to arp only if it is output
1174            route or unicast forwarding path.
1175          */
1176         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1177                 int err = rt_bind_neighbour(rt);
1178                 if (err) {
1179                         spin_unlock_bh(rt_hash_lock_addr(hash));
1180
1181                         if (err != -ENOBUFS) {
1182                                 rt_drop(rt);
1183                                 return ERR_PTR(err);
1184                         }
1185
1186                         /* Neighbour tables are full and nothing
1187                            can be released. Try to shrink route cache,
1188                            it is most likely it holds some neighbour records.
1189                          */
1190                         if (attempts-- > 0) {
                                     /* Temporarily force the most aggressive GC settings, then restore them. */
1191                                 int saved_elasticity = ip_rt_gc_elasticity;
1192                                 int saved_int = ip_rt_gc_min_interval;
1193                                 ip_rt_gc_elasticity     = 1;
1194                                 ip_rt_gc_min_interval   = 0;
1195                                 rt_garbage_collect(&ipv4_dst_ops);
1196                                 ip_rt_gc_min_interval   = saved_int;
1197                                 ip_rt_gc_elasticity     = saved_elasticity;
1198                                 goto restart;
1199                         }
1200
1201                         if (net_ratelimit())
1202                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1203                         rt_drop(rt);
1204                         return ERR_PTR(-ENOBUFS);
1205                 }
1206         }
1207
1208         rt->dst.rt_next = rt_hash_table[hash].chain;
1209
1210         /*
1211          * Since lookup is lockfree, we must make sure
1212          * previous writes to rt are committed to memory
1213          * before making rt visible to other CPUS.
1214          */
1215         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1216
1217         spin_unlock_bh(rt_hash_lock_addr(hash));
1218
1219 skip_hashing:
1220         if (skb)
1221                 skb_dst_set(skb, &rt->dst);
1222         return rt;
1223 }
1224
1225 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1226
1227 static u32 rt_peer_genid(void)
1228 {
1229         return atomic_read(&__rt_peer_genid);
1230 }
1231
1232 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1233 {
1234         struct inet_peer *peer;
1235
1236         peer = inet_getpeer_v4(daddr, create);
1237
1238         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1239                 inet_putpeer(peer);
1240         else
1241                 rt->rt_peer_genid = rt_peer_genid();
1242 }
1243
1244 /*
1245  * Peer allocation may fail only in serious out-of-memory conditions.  However
1246  * we still can generate some output.
1247  * Random ID selection looks a bit dangerous because we have no chances to
1248  * select ID being unique in a reasonable period of time.
1249  * But broken packet identifier may be better than no packet at all.
1250  */
1251 static void ip_select_fb_ident(struct iphdr *iph)
1252 {
1253         static DEFINE_SPINLOCK(ip_fb_id_lock);
1254         static u32 ip_fallback_id;
1255         u32 salt;
1256
1257         spin_lock_bh(&ip_fb_id_lock);
1258         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1259         iph->id = htons(salt & 0xFFFF);
1260         ip_fallback_id = salt;
1261         spin_unlock_bh(&ip_fb_id_lock);
1262 }
1263
1264 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1265 {
1266         struct rtable *rt = (struct rtable *) dst;
1267
1268         if (rt) {
1269                 if (rt->peer == NULL)
1270                         rt_bind_peer(rt, rt->rt_dst, 1);
1271
1272                 /* If peer is attached to destination, it is never detached,
1273                    so that we need not to grab a lock to dereference it.
1274                  */
1275                 if (rt->peer) {
1276                         iph->id = htons(inet_getid(rt->peer, more));
1277                         return;
1278                 }
1279         } else
1280                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1281                        __builtin_return_address(0));
1282
1283         ip_select_fb_ident(iph);
1284 }
1285 EXPORT_SYMBOL(__ip_select_ident);
1286
1287 static void rt_del(unsigned hash, struct rtable *rt)
1288 {
1289         struct rtable __rcu **rthp;
1290         struct rtable *aux;
1291
1292         rthp = &rt_hash_table[hash].chain;
1293         spin_lock_bh(rt_hash_lock_addr(hash));
1294         ip_rt_put(rt);
1295         while ((aux = rcu_dereference_protected(*rthp,
1296                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1297                 if (aux == rt || rt_is_expired(aux)) {
1298                         *rthp = aux->dst.rt_next;
1299                         rt_free(aux);
1300                         continue;
1301                 }
1302                 rthp = &aux->dst.rt_next;
1303         }
1304         spin_unlock_bh(rt_hash_lock_addr(hash));
1305 }
1306
1307 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1308 {
1309         struct rtable *rt = (struct rtable *) dst;
1310         __be32 orig_gw = rt->rt_gateway;
1311         struct neighbour *n, *old_n;
1312
1313         dst_confirm(&rt->dst);
1314
1315         rt->rt_gateway = peer->redirect_learned.a4;
1316
1317         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1318         if (IS_ERR(n))
1319                 return PTR_ERR(n);
1320         old_n = xchg(&rt->dst._neighbour, n);
1321         if (old_n)
1322                 neigh_release(old_n);
1323         if (!n || !(n->nud_state & NUD_VALID)) {
1324                 if (n)
1325                         neigh_event_send(n, NULL);
1326                 rt->rt_gateway = orig_gw;
1327                 return -EAGAIN;
1328         } else {
1329                 rt->rt_flags |= RTCF_REDIRECTED;
1330                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1331         }
1332         return 0;
1333 }
1334
1335 /* called in rcu_read_lock() section */
/*
 * ip_rt_redirect - process a redirect advising @new_gw in place of @old_gw
 * for traffic from @saddr to @daddr, received on @dev.
 *
 * The redirect is rejected (and optionally logged) when the new gateway
 * equals the old one, redirects are not accepted on the device, or the new
 * gateway fails the address checks below.
 *
 * Otherwise every cached output route to @daddr through @old_gw on @dev is
 * looked up under each combination of source key {@saddr, 0} and interface
 * key {dev->ifindex, 0}. For each match the new gateway is recorded in the
 * route's inet_peer and check_peer_redir() is called; its return value is
 * not examined here.
 */
1336 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1337                     __be32 saddr, struct net_device *dev)
1338 {
1339         int s, i;
1340         struct in_device *in_dev = __in_dev_get_rcu(dev);
1341         __be32 skeys[2] = { saddr, 0 };
1342         int    ikeys[2] = { dev->ifindex, 0 };
1343         struct inet_peer *peer;
1344         struct net *net;
1345
1346         if (!in_dev)
1347                 return;
1348
1349         net = dev_net(dev);
1350         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1351             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1352             ipv4_is_zeronet(new_gw))
1353                 goto reject_redirect;
1354
1355         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1356                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1357                         goto reject_redirect;
1358                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1359                         goto reject_redirect;
1360         } else {
1361                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1362                         goto reject_redirect;
1363         }
1364
             /* Try all four (source key, interface key) combinations. */
1365         for (s = 0; s < 2; s++) {
1366                 for (i = 0; i < 2; i++) {
1367                         unsigned int hash;
1368                         struct rtable __rcu **rthp;
1369                         struct rtable *rt;
1370
1371                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1372
1373                         rthp = &rt_hash_table[hash].chain;
1374
1375                         while ((rt = rcu_dereference(*rthp)) != NULL) {
                                     /* Advance first so that "continue" moves on to the next entry. */
1376                                 rthp = &rt->dst.rt_next;
1377
1378                                 if (rt->rt_key_dst != daddr ||
1379                                     rt->rt_key_src != skeys[s] ||
1380                                     rt->rt_oif != ikeys[i] ||
1381                                     rt_is_input_route(rt) ||
1382                                     rt_is_expired(rt) ||
1383                                     !net_eq(dev_net(rt->dst.dev), net) ||
1384                                     rt->dst.error ||
1385                                     rt->dst.dev != dev ||
1386                                     rt->rt_gateway != old_gw)
1387                                         continue;
1388
1389                                 if (!rt->peer)
1390                                         rt_bind_peer(rt, rt->rt_dst, 1);
1391
1392                                 peer = rt->peer;
1393                                 if (peer) {
                                             /* Bump the peer generation only when the learned gateway changes. */
1394                                         if (peer->redirect_learned.a4 != new_gw) {
1395                                                 peer->redirect_learned.a4 = new_gw;
1396                                                 atomic_inc(&__rt_peer_genid);
1397                                         }
1398                                         check_peer_redir(&rt->dst, peer);
1399                                 }
1400                         }
1401                 }
1402         }
1403         return;
1404
1405 reject_redirect:
1406 #ifdef CONFIG_IP_ROUTE_VERBOSE
1407         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1408                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1409                         "  Advised path = %pI4 -> %pI4\n",
1410                        &old_gw, dev->name, &new_gw,
1411                        &saddr, &daddr);
1412 #endif
             /* Empty statement so the label is valid when the #ifdef body is compiled out. */
1413         ;
1414 }
1415
1416 static bool peer_pmtu_expired(struct inet_peer *peer)
1417 {
1418         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1419
1420         return orig &&
1421                time_after_eq(jiffies, orig) &&
1422                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1423 }
1424
1425 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1426 {
1427         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1428
1429         return orig &&
1430                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1431 }
1432
1433 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1434 {
1435         struct rtable *rt = (struct rtable *)dst;
1436         struct dst_entry *ret = dst;
1437
1438         if (rt) {
1439                 if (dst->obsolete > 0) {
1440                         ip_rt_put(rt);
1441                         ret = NULL;
1442                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1443                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1444                                                 rt->rt_oif,
1445                                                 rt_genid(dev_net(dst->dev)));
1446                         rt_del(hash, rt);
1447                         ret = NULL;
1448                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1449                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1450                 }
1451         }
1452         return ret;
1453 }
1454
1455 /*
1456  * Algorithm:
1457  *      1. The first ip_rt_redirect_number redirects are sent
1458  *         with exponential backoff, then we stop sending them at all,
1459  *         assuming that the host ignores our redirects.
1460  *      2. If we did not see packets requiring redirects
1461  *         during ip_rt_redirect_silence, we assume that the host
1462  *         forgot redirected route and start to send redirects again.
1463  *
1464  * This algorithm is much cheaper and more intelligent than dumb load limiting
1465  * in icmp.c.
1466  *
1467  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1468  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1469  */
1470
1471 void ip_rt_send_redirect(struct sk_buff *skb)
1472 {
1473         struct rtable *rt = skb_rtable(skb);
1474         struct in_device *in_dev;
1475         struct inet_peer *peer;
1476         int log_martians;
1477
1478         rcu_read_lock();
1479         in_dev = __in_dev_get_rcu(rt->dst.dev);
1480         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1481                 rcu_read_unlock();
1482                 return;
1483         }
1484         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1485         rcu_read_unlock();
1486
1487         if (!rt->peer)
1488                 rt_bind_peer(rt, rt->rt_dst, 1);
1489         peer = rt->peer;
1490         if (!peer) {
1491                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1492                 return;
1493         }
1494
1495         /* No redirected packets during ip_rt_redirect_silence;
1496          * reset the algorithm.
1497          */
1498         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1499                 peer->rate_tokens = 0;
1500
1501         /* Too many ignored redirects; do not send anything
1502          * set dst.rate_last to the last seen redirected packet.
1503          */
1504         if (peer->rate_tokens >= ip_rt_redirect_number) {
1505                 peer->rate_last = jiffies;
1506                 return;
1507         }
1508
1509         /* Check for load limit; set rate_last to the latest sent
1510          * redirect.
1511          */
1512         if (peer->rate_tokens == 0 ||
1513             time_after(jiffies,
1514                        (peer->rate_last +
1515                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1516                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1517                 peer->rate_last = jiffies;
1518                 ++peer->rate_tokens;
1519 #ifdef CONFIG_IP_ROUTE_VERBOSE
1520                 if (log_martians &&
1521                     peer->rate_tokens == ip_rt_redirect_number &&
1522                     net_ratelimit())
1523                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1524                                &ip_hdr(skb)->saddr, rt->rt_iif,
1525                                 &rt->rt_dst, &rt->rt_gateway);
1526 #endif
1527         }
1528 }
1529
1530 static int ip_error(struct sk_buff *skb)
1531 {
1532         struct rtable *rt = skb_rtable(skb);
1533         struct inet_peer *peer;
1534         unsigned long now;
1535         bool send;
1536         int code;
1537
1538         switch (rt->dst.error) {
1539         case EINVAL:
1540         default:
1541                 goto out;
1542         case EHOSTUNREACH:
1543                 code = ICMP_HOST_UNREACH;
1544                 break;
1545         case ENETUNREACH:
1546                 code = ICMP_NET_UNREACH;
1547                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1548                                 IPSTATS_MIB_INNOROUTES);
1549                 break;
1550         case EACCES:
1551                 code = ICMP_PKT_FILTERED;
1552                 break;
1553         }
1554
1555         if (!rt->peer)
1556                 rt_bind_peer(rt, rt->rt_dst, 1);
1557         peer = rt->peer;
1558
1559         send = true;
1560         if (peer) {
1561                 now = jiffies;
1562                 peer->rate_tokens += now - peer->rate_last;
1563                 if (peer->rate_tokens > ip_rt_error_burst)
1564                         peer->rate_tokens = ip_rt_error_burst;
1565                 peer->rate_last = now;
1566                 if (peer->rate_tokens >= ip_rt_error_cost)
1567                         peer->rate_tokens -= ip_rt_error_cost;
1568                 else
1569                         send = false;
1570         }
1571         if (send)
1572                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1573
1574 out:    kfree_skb(skb);
1575         return 0;
1576 }
1577
1578 /*
1579  *      The last two values are not from the RFC but
1580  *      are needed for AMPRnet AX.25 paths.
1581  */
1582
/* MTU plateau values, in descending order. */
static const unsigned short mtu_plateau[] = {
        32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above any table entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau = mtu_plateau;
        const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

        while (plateau < end) {
                if (*plateau < old_mtu)
                        return *plateau;
                plateau++;
        }

        return 68;
}
1595
1596 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1597                                  unsigned short new_mtu,
1598                                  struct net_device *dev)
1599 {
1600         unsigned short old_mtu = ntohs(iph->tot_len);
1601         unsigned short est_mtu = 0;
1602         struct inet_peer *peer;
1603
1604         peer = inet_getpeer_v4(iph->daddr, 1);
1605         if (peer) {
1606                 unsigned short mtu = new_mtu;
1607
1608                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1609                         /* BSD 4.2 derived systems incorrectly adjust
1610                          * tot_len by the IP header length, and report
1611                          * a zero MTU in the ICMP message.
1612                          */
1613                         if (mtu == 0 &&
1614                             old_mtu >= 68 + (iph->ihl << 2))
1615                                 old_mtu -= iph->ihl << 2;
1616                         mtu = guess_mtu(old_mtu);
1617                 }
1618
1619                 if (mtu < ip_rt_min_pmtu)
1620                         mtu = ip_rt_min_pmtu;
1621                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1622                         unsigned long pmtu_expires;
1623
1624                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1625                         if (!pmtu_expires)
1626                                 pmtu_expires = 1UL;
1627
1628                         est_mtu = mtu;
1629                         peer->pmtu_learned = mtu;
1630                         peer->pmtu_expires = pmtu_expires;
1631                         atomic_inc(&__rt_peer_genid);
1632                 }
1633
1634                 inet_putpeer(peer);
1635         }
1636         return est_mtu ? : new_mtu;
1637 }
1638
1639 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1640 {
1641         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1642
1643         if (!expires)
1644                 return;
1645         if (time_before(jiffies, expires)) {
1646                 u32 orig_dst_mtu = dst_mtu(dst);
1647                 if (peer->pmtu_learned < orig_dst_mtu) {
1648                         if (!peer->pmtu_orig)
1649                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1650                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1651                 }
1652         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1653                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1654 }
1655
1656 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1657 {
1658         struct rtable *rt = (struct rtable *) dst;
1659         struct inet_peer *peer;
1660
1661         dst_confirm(dst);
1662
1663         if (!rt->peer)
1664                 rt_bind_peer(rt, rt->rt_dst, 1);
1665         peer = rt->peer;
1666         if (peer) {
1667                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1668
1669                 if (mtu < ip_rt_min_pmtu)
1670                         mtu = ip_rt_min_pmtu;
1671                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1672
1673                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1674                         if (!pmtu_expires)
1675                                 pmtu_expires = 1UL;
1676
1677                         peer->pmtu_learned = mtu;
1678                         peer->pmtu_expires = pmtu_expires;
1679
1680                         atomic_inc(&__rt_peer_genid);
1681                         rt->rt_peer_genid = rt_peer_genid();
1682                 }
1683                 check_peer_pmtu(dst, peer);
1684         }
1685 }
1686
1687
1688 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1689 {
1690         struct rtable *rt = (struct rtable *) dst;
1691
1692         if (rt_is_expired(rt))
1693                 return NULL;
1694         if (rt->rt_peer_genid != rt_peer_genid()) {
1695                 struct inet_peer *peer;
1696
1697                 if (!rt->peer)
1698                         rt_bind_peer(rt, rt->rt_dst, 0);
1699
1700                 peer = rt->peer;
1701                 if (peer) {
1702                         check_peer_pmtu(dst, peer);
1703
1704                         if (peer->redirect_learned.a4 &&
1705                             peer->redirect_learned.a4 != rt->rt_gateway) {
1706                                 if (check_peer_redir(dst, peer))
1707                                         return NULL;
1708                         }
1709                 }
1710
1711                 rt->rt_peer_genid = rt_peer_genid();
1712         }
1713         return dst;
1714 }
1715
1716 static void ipv4_dst_destroy(struct dst_entry *dst)
1717 {
1718         struct rtable *rt = (struct rtable *) dst;
1719         struct inet_peer *peer = rt->peer;
1720
1721         if (rt->fi) {
1722                 fib_info_put(rt->fi);
1723                 rt->fi = NULL;
1724         }
1725         if (peer) {
1726                 rt->peer = NULL;
1727                 inet_putpeer(peer);
1728         }
1729 }
1730
1731
1732 static void ipv4_link_failure(struct sk_buff *skb)
1733 {
1734         struct rtable *rt;
1735
1736         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1737
1738         rt = skb_rtable(skb);
1739         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1740                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1741 }
1742
1743 static int ip_rt_bug(struct sk_buff *skb)
1744 {
1745         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1746                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1747                 skb->dev ? skb->dev->name : "?");
1748         kfree_skb(skb);
1749         WARN_ON(1);
1750         return 0;
1751 }
1752
1753 /*
1754    We do not cache source address of outgoing interface,
1755    because it is used only by IP RR, TS and SRR options,
1756    so that it out of fast path.
1757
1758    BTW remember: "addr" is allowed to be not aligned
1759    in IP options!
1760  */
1761
1762 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1763 {
1764         __be32 src;
1765
1766         if (rt_is_output_route(rt))
1767                 src = ip_hdr(skb)->saddr;
1768         else {
1769                 struct fib_result res;
1770                 struct flowi4 fl4;
1771                 struct iphdr *iph;
1772
1773                 iph = ip_hdr(skb);
1774
1775                 memset(&fl4, 0, sizeof(fl4));
1776                 fl4.daddr = iph->daddr;
1777                 fl4.saddr = iph->saddr;
1778                 fl4.flowi4_tos = RT_TOS(iph->tos);
1779                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1780                 fl4.flowi4_iif = skb->dev->ifindex;
1781                 fl4.flowi4_mark = skb->mark;
1782
1783                 rcu_read_lock();
1784                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1785                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1786                 else
1787                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1788                                         RT_SCOPE_UNIVERSE);
1789                 rcu_read_unlock();
1790         }
1791         memcpy(addr, &src, 4);
1792 }
1793
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge tag into the route's tclassid, one 16-bit half at a time: a half
 * of tclassid is filled from the same half of tag only when it is
 * currently zero.  Halves that are already set are left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 half_mask[] = { 0xFFFF, 0xFFFF0000 };
        int i;

        for (i = 0; i < 2; i++) {
                if (!(rt->dst.tclassid & half_mask[i]))
                        rt->dst.tclassid |= tag & half_mask[i];
        }
}
#endif
1803
1804 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1805 {
1806         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1807
1808         if (advmss == 0) {
1809                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1810                                ip_rt_min_advmss);
1811                 if (advmss > 65535 - 40)
1812                         advmss = 65535 - 40;
1813         }
1814         return advmss;
1815 }
1816
1817 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1818 {
1819         unsigned int mtu = dst->dev->mtu;
1820
1821         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1822                 const struct rtable *rt = (const struct rtable *) dst;
1823
1824                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1825                         mtu = 576;
1826         }
1827
1828         if (mtu > IP_MAX_MTU)
1829                 mtu = IP_MAX_MTU;
1830
1831         return mtu;
1832 }
1833
1834 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1835                             struct fib_info *fi)
1836 {
1837         struct inet_peer *peer;
1838         int create = 0;
1839
1840         /* If a peer entry exists for this destination, we must hook
1841          * it up in order to get at cached metrics.
1842          */
1843         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1844                 create = 1;
1845
1846         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1847         if (peer) {
1848                 rt->rt_peer_genid = rt_peer_genid();
1849                 if (inet_metrics_new(peer))
1850                         memcpy(peer->metrics, fi->fib_metrics,
1851                                sizeof(u32) * RTAX_MAX);
1852                 dst_init_metrics(&rt->dst, peer->metrics, false);
1853
1854                 check_peer_pmtu(&rt->dst, peer);
1855                 if (peer->redirect_learned.a4 &&
1856                     peer->redirect_learned.a4 != rt->rt_gateway) {
1857                         rt->rt_gateway = peer->redirect_learned.a4;
1858                         rt->rt_flags |= RTCF_REDIRECTED;
1859                 }
1860         } else {
1861                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1862                         rt->fi = fi;
1863                         atomic_inc(&fi->fib_clntref);
1864                 }
1865                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1866         }
1867 }
1868
1869 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1870                            const struct fib_result *res,
1871                            struct fib_info *fi, u16 type, u32 itag)
1872 {
1873         struct dst_entry *dst = &rt->dst;
1874
1875         if (fi) {
1876                 if (FIB_RES_GW(*res) &&
1877                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1878                         rt->rt_gateway = FIB_RES_GW(*res);
1879                 rt_init_metrics(rt, fl4, fi);
1880 #ifdef CONFIG_IP_ROUTE_CLASSID
1881                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1882 #endif
1883         }
1884
1885         if (dst_mtu(dst) > IP_MAX_MTU)
1886                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1887         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1888                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1889
1890 #ifdef CONFIG_IP_ROUTE_CLASSID
1891 #ifdef CONFIG_IP_MULTIPLE_TABLES
1892         set_class_tag(rt, fib_rules_tclass(res));
1893 #endif
1894         set_class_tag(rt, itag);
1895 #endif
1896 }
1897
1898 static struct rtable *rt_dst_alloc(struct net_device *dev,
1899                                    bool nopolicy, bool noxfrm)
1900 {
1901         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1902                          DST_HOST |
1903                          (nopolicy ? DST_NOPOLICY : 0) |
1904                          (noxfrm ? DST_NOXFRM : 0));
1905 }
1906
1907 /* called in rcu_read_lock() section */
1908 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1909                                 u8 tos, struct net_device *dev, int our)
1910 {
1911         unsigned int hash;
1912         struct rtable *rth;
1913         __be32 spec_dst;
1914         struct in_device *in_dev = __in_dev_get_rcu(dev);
1915         u32 itag = 0;
1916         int err;
1917
1918         /* Primary sanity checks. */
1919
1920         if (in_dev == NULL)
1921                 return -EINVAL;
1922
1923         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1924             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1925                 goto e_inval;
1926
1927         if (ipv4_is_zeronet(saddr)) {
1928                 if (!ipv4_is_local_multicast(daddr))
1929                         goto e_inval;
1930                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1931         } else {
1932                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1933                                           &itag);
1934                 if (err < 0)
1935                         goto e_err;
1936         }
1937         rth = rt_dst_alloc(init_net.loopback_dev,
1938                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1939         if (!rth)
1940                 goto e_nobufs;
1941
1942 #ifdef CONFIG_IP_ROUTE_CLASSID
1943         rth->dst.tclassid = itag;
1944 #endif
1945         rth->dst.output = ip_rt_bug;
1946
1947         rth->rt_key_dst = daddr;
1948         rth->rt_key_src = saddr;
1949         rth->rt_genid   = rt_genid(dev_net(dev));
1950         rth->rt_flags   = RTCF_MULTICAST;
1951         rth->rt_type    = RTN_MULTICAST;
1952         rth->rt_key_tos = tos;
1953         rth->rt_dst     = daddr;
1954         rth->rt_src     = saddr;
1955         rth->rt_route_iif = dev->ifindex;
1956         rth->rt_iif     = dev->ifindex;
1957         rth->rt_oif     = 0;
1958         rth->rt_mark    = skb->mark;
1959         rth->rt_gateway = daddr;
1960         rth->rt_spec_dst= spec_dst;
1961         rth->rt_peer_genid = 0;
1962         rth->peer = NULL;
1963         rth->fi = NULL;
1964         if (our) {
1965                 rth->dst.input= ip_local_deliver;
1966                 rth->rt_flags |= RTCF_LOCAL;
1967         }
1968
1969 #ifdef CONFIG_IP_MROUTE
1970         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1971                 rth->dst.input = ip_mr_input;
1972 #endif
1973         RT_CACHE_STAT_INC(in_slow_mc);
1974
1975         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1976         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1977         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1978
1979 e_nobufs:
1980         return -ENOBUFS;
1981 e_inval:
1982         return -EINVAL;
1983 e_err:
1984         return err;
1985 }
1986
1987
1988 static void ip_handle_martian_source(struct net_device *dev,
1989                                      struct in_device *in_dev,
1990                                      struct sk_buff *skb,
1991                                      __be32 daddr,
1992                                      __be32 saddr)
1993 {
1994         RT_CACHE_STAT_INC(in_martian_src);
1995 #ifdef CONFIG_IP_ROUTE_VERBOSE
1996         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1997                 /*
1998                  *      RFC1812 recommendation, if source is martian,
1999                  *      the only hint is MAC header.
2000                  */
2001                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2002                         &daddr, &saddr, dev->name);
2003                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2004                         int i;
2005                         const unsigned char *p = skb_mac_header(skb);
2006                         printk(KERN_WARNING "ll header: ");
2007                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2008                                 printk("%02x", *p);
2009                                 if (i < (dev->hard_header_len - 1))
2010                                         printk(":");
2011                         }
2012                         printk("\n");
2013                 }
2014         }
2015 #endif
2016 }
2017
2018 /* called in rcu_read_lock() section */
2019 static int __mkroute_input(struct sk_buff *skb,
2020                            const struct fib_result *res,
2021                            struct in_device *in_dev,
2022                            __be32 daddr, __be32 saddr, u32 tos,
2023                            struct rtable **result)
2024 {
2025         struct rtable *rth;
2026         int err;
2027         struct in_device *out_dev;
2028         unsigned int flags = 0;
2029         __be32 spec_dst;
2030         u32 itag;
2031
2032         /* get a working reference to the output device */
2033         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2034         if (out_dev == NULL) {
2035                 if (net_ratelimit())
2036                         printk(KERN_CRIT "Bug in ip_route_input" \
2037                                "_slow(). Please, report\n");
2038                 return -EINVAL;
2039         }
2040
2041
2042         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2043                                   in_dev->dev, &spec_dst, &itag);
2044         if (err < 0) {
2045                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2046                                          saddr);
2047
2048                 goto cleanup;
2049         }
2050
2051         if (err)
2052                 flags |= RTCF_DIRECTSRC;
2053
2054         if (out_dev == in_dev && err &&
2055             (IN_DEV_SHARED_MEDIA(out_dev) ||
2056              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2057                 flags |= RTCF_DOREDIRECT;
2058
2059         if (skb->protocol != htons(ETH_P_IP)) {
2060                 /* Not IP (i.e. ARP). Do not create route, if it is
2061                  * invalid for proxy arp. DNAT routes are always valid.
2062                  *
2063                  * Proxy arp feature have been extended to allow, ARP
2064                  * replies back to the same interface, to support
2065                  * Private VLAN switch technologies. See arp.c.
2066                  */
2067                 if (out_dev == in_dev &&
2068                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2069                         err = -EINVAL;
2070                         goto cleanup;
2071                 }
2072         }
2073
2074         rth = rt_dst_alloc(out_dev->dev,
2075                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2076                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2077         if (!rth) {
2078                 err = -ENOBUFS;
2079                 goto cleanup;
2080         }
2081
2082         rth->rt_key_dst = daddr;
2083         rth->rt_key_src = saddr;
2084         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2085         rth->rt_flags = flags;
2086         rth->rt_type = res->type;
2087         rth->rt_key_tos = tos;
2088         rth->rt_dst     = daddr;
2089         rth->rt_src     = saddr;
2090         rth->rt_route_iif = in_dev->dev->ifindex;
2091         rth->rt_iif     = in_dev->dev->ifindex;
2092         rth->rt_oif     = 0;
2093         rth->rt_mark    = skb->mark;
2094         rth->rt_gateway = daddr;
2095         rth->rt_spec_dst= spec_dst;
2096         rth->rt_peer_genid = 0;
2097         rth->peer = NULL;
2098         rth->fi = NULL;
2099
2100         rth->dst.input = ip_forward;
2101         rth->dst.output = ip_output;
2102
2103         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2104
2105         *result = rth;
2106         err = 0;
2107  cleanup:
2108         return err;
2109 }
2110
2111 static int ip_mkroute_input(struct sk_buff *skb,
2112                             struct fib_result *res,
2113                             const struct flowi4 *fl4,
2114                             struct in_device *in_dev,
2115                             __be32 daddr, __be32 saddr, u32 tos)
2116 {
2117         struct rtable* rth = NULL;
2118         int err;
2119         unsigned hash;
2120
2121 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2122         if (res->fi && res->fi->fib_nhs > 1)
2123                 fib_select_multipath(res);
2124 #endif
2125
2126         /* create a routing cache entry */
2127         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2128         if (err)
2129                 return err;
2130
2131         /* put it into the cache */
2132         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2133                        rt_genid(dev_net(rth->dst.dev)));
2134         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2135         if (IS_ERR(rth))
2136                 return PTR_ERR(rth);
2137         return 0;
2138 }
2139
2140 /*
2141  *      NOTE. We drop all the packets that has local source
2142  *      addresses, because every properly looped back packet
2143  *      must have correct destination already attached by output routine.
2144  *
2145  *      Such approach solves two big problems:
2146  *      1. Not simplex devices are handled properly.
2147  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2148  *      called with rcu_read_lock()
2149  */
2150
2151 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2152                                u8 tos, struct net_device *dev)
2153 {
2154         struct fib_result res;
2155         struct in_device *in_dev = __in_dev_get_rcu(dev);
2156         struct flowi4   fl4;
2157         unsigned        flags = 0;
2158         u32             itag = 0;
2159         struct rtable * rth;
2160         unsigned        hash;
2161         __be32          spec_dst;
2162         int             err = -EINVAL;
2163         struct net    * net = dev_net(dev);
2164
2165         /* IP on this device is disabled. */
2166
2167         if (!in_dev)
2168                 goto out;
2169
2170         /* Check for the most weird martians, which can be not detected
2171            by fib_lookup.
2172          */
2173
2174         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2175             ipv4_is_loopback(saddr))
2176                 goto martian_source;
2177
2178         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2179                 goto brd_input;
2180
2181         /* Accept zero addresses only to limited broadcast;
2182          * I even do not know to fix it or not. Waiting for complains :-)
2183          */
2184         if (ipv4_is_zeronet(saddr))
2185                 goto martian_source;
2186
2187         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2188                 goto martian_destination;
2189
2190         /*
2191          *      Now we are ready to route packet.
2192          */
2193         fl4.flowi4_oif = 0;
2194         fl4.flowi4_iif = dev->ifindex;
2195         fl4.flowi4_mark = skb->mark;
2196         fl4.flowi4_tos = tos;
2197         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2198         fl4.daddr = daddr;
2199         fl4.saddr = saddr;
2200         err = fib_lookup(net, &fl4, &res);
2201         if (err != 0) {
2202                 if (!IN_DEV_FORWARD(in_dev))
2203                         goto e_hostunreach;
2204                 goto no_route;
2205         }
2206
2207         RT_CACHE_STAT_INC(in_slow_tot);
2208
2209         if (res.type == RTN_BROADCAST)
2210                 goto brd_input;
2211
2212         if (res.type == RTN_LOCAL) {
2213                 err = fib_validate_source(skb, saddr, daddr, tos,
2214                                           net->loopback_dev->ifindex,
2215                                           dev, &spec_dst, &itag);
2216                 if (err < 0)
2217                         goto martian_source_keep_err;
2218                 if (err)
2219                         flags |= RTCF_DIRECTSRC;
2220                 spec_dst = daddr;
2221                 goto local_input;
2222         }
2223
2224         if (!IN_DEV_FORWARD(in_dev))
2225                 goto e_hostunreach;
2226         if (res.type != RTN_UNICAST)
2227                 goto martian_destination;
2228
2229         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2230 out:    return err;
2231
2232 brd_input:
2233         if (skb->protocol != htons(ETH_P_IP))
2234                 goto e_inval;
2235
2236         if (ipv4_is_zeronet(saddr))
2237                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2238         else {
2239                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2240                                           &itag);
2241                 if (err < 0)
2242                         goto martian_source_keep_err;
2243                 if (err)
2244                         flags |= RTCF_DIRECTSRC;
2245         }
2246         flags |= RTCF_BROADCAST;
2247         res.type = RTN_BROADCAST;
2248         RT_CACHE_STAT_INC(in_brd);
2249
2250 local_input:
2251         rth = rt_dst_alloc(net->loopback_dev,
2252                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2253         if (!rth)
2254                 goto e_nobufs;
2255
2256         rth->dst.input= ip_local_deliver;
2257         rth->dst.output= ip_rt_bug;
2258 #ifdef CONFIG_IP_ROUTE_CLASSID
2259         rth->dst.tclassid = itag;
2260 #endif
2261
2262         rth->rt_key_dst = daddr;
2263         rth->rt_key_src = saddr;
2264         rth->rt_genid = rt_genid(net);
2265         rth->rt_flags   = flags|RTCF_LOCAL;
2266         rth->rt_type    = res.type;
2267         rth->rt_key_tos = tos;
2268         rth->rt_dst     = daddr;
2269         rth->rt_src     = saddr;
2270 #ifdef CONFIG_IP_ROUTE_CLASSID
2271         rth->dst.tclassid = itag;
2272 #endif
2273         rth->rt_route_iif = dev->ifindex;
2274         rth->rt_iif     = dev->ifindex;
2275         rth->rt_oif     = 0;
2276         rth->rt_mark    = skb->mark;
2277         rth->rt_gateway = daddr;
2278         rth->rt_spec_dst= spec_dst;
2279         rth->rt_peer_genid = 0;
2280         rth->peer = NULL;
2281         rth->fi = NULL;
2282         if (res.type == RTN_UNREACHABLE) {
2283                 rth->dst.input= ip_error;
2284                 rth->dst.error= -err;
2285                 rth->rt_flags   &= ~RTCF_LOCAL;
2286         }
2287         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2288         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2289         err = 0;
2290         if (IS_ERR(rth))
2291                 err = PTR_ERR(rth);
2292         goto out;
2293
2294 no_route:
2295         RT_CACHE_STAT_INC(in_no_route);
2296         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2297         res.type = RTN_UNREACHABLE;
2298         if (err == -ESRCH)
2299                 err = -ENETUNREACH;
2300         goto local_input;
2301
2302         /*
2303          *      Do not cache martian addresses: they should be logged (RFC1812)
2304          */
2305 martian_destination:
2306         RT_CACHE_STAT_INC(in_martian_dst);
2307 #ifdef CONFIG_IP_ROUTE_VERBOSE
2308         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2309                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2310                         &daddr, &saddr, dev->name);
2311 #endif
2312
2313 e_hostunreach:
2314         err = -EHOSTUNREACH;
2315         goto out;
2316
2317 e_inval:
2318         err = -EINVAL;
2319         goto out;
2320
2321 e_nobufs:
2322         err = -ENOBUFS;
2323         goto out;
2324
2325 martian_source:
2326         err = -EINVAL;
2327 martian_source_keep_err:
2328         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2329         goto out;
2330 }
2331
2332 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2333                            u8 tos, struct net_device *dev, bool noref)
2334 {
2335         struct rtable * rth;
2336         unsigned        hash;
2337         int iif = dev->ifindex;
2338         struct net *net;
2339         int res;
2340
2341         net = dev_net(dev);
2342
2343         rcu_read_lock();
2344
2345         if (!rt_caching(net))
2346                 goto skip_cache;
2347
2348         tos &= IPTOS_RT_MASK;
2349         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2350
2351         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2352              rth = rcu_dereference(rth->dst.rt_next)) {
2353                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2354                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2355                      (rth->rt_route_iif ^ iif) |
2356                      (rth->rt_key_tos ^ tos)) == 0 &&
2357                     rth->rt_mark == skb->mark &&
2358                     net_eq(dev_net(rth->dst.dev), net) &&
2359                     !rt_is_expired(rth)) {
2360                         if (noref) {
2361                                 dst_use_noref(&rth->dst, jiffies);
2362                                 skb_dst_set_noref(skb, &rth->dst);
2363                         } else {
2364                                 dst_use(&rth->dst, jiffies);
2365                                 skb_dst_set(skb, &rth->dst);
2366                         }
2367                         RT_CACHE_STAT_INC(in_hit);
2368                         rcu_read_unlock();
2369                         return 0;
2370                 }
2371                 RT_CACHE_STAT_INC(in_hlist_search);
2372         }
2373
2374 skip_cache:
2375         /* Multicast recognition logic is moved from route cache to here.
2376            The problem was that too many Ethernet cards have broken/missing
2377            hardware multicast filters :-( As result the host on multicasting
2378            network acquires a lot of useless route cache entries, sort of
2379            SDR messages from all the world. Now we try to get rid of them.
2380            Really, provided software IP multicast filter is organized
2381            reasonably (at least, hashed), it does not result in a slowdown
2382            comparing with route cache reject entries.
2383            Note, that multicast routers are not affected, because
2384            route cache entry is created eventually.
2385          */
2386         if (ipv4_is_multicast(daddr)) {
2387                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2388
2389                 if (in_dev) {
2390                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2391                                                   ip_hdr(skb)->protocol);
2392                         if (our
2393 #ifdef CONFIG_IP_MROUTE
2394                                 ||
2395                             (!ipv4_is_local_multicast(daddr) &&
2396                              IN_DEV_MFORWARD(in_dev))
2397 #endif
2398                            ) {
2399                                 int res = ip_route_input_mc(skb, daddr, saddr,
2400                                                             tos, dev, our);
2401                                 rcu_read_unlock();
2402                                 return res;
2403                         }
2404                 }
2405                 rcu_read_unlock();
2406                 return -EINVAL;
2407         }
2408         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2409         rcu_read_unlock();
2410         return res;
2411 }
2412 EXPORT_SYMBOL(ip_route_input_common);
2413
2414 /* called with rcu_read_lock() */
     /*
      * Allocate and fill in an output route for the flow described by fl4.
      *
      * res         - FIB lookup result; supplies fib_info, type and prefixlen
      * fl4         - resolved flow; its daddr/saddr/mark/proto are used
      * orig_daddr, orig_saddr, orig_oif - stored as the route's key fields
      * dev_out     - device the route is allocated on
      * flags       - initial RTCF_* flags; extended below
      *
      * Returns the new rtable, or ERR_PTR(-EINVAL) for a loopback source on
      * a non-loopback device, a zeronet destination or a device without
      * in_device, or ERR_PTR(-ENOBUFS) when allocation fails.  The route is
      * not inserted into the hash table by this function.
      */
2415 static struct rtable *__mkroute_output(const struct fib_result *res,
2416                                        const struct flowi4 *fl4,
2417                                        __be32 orig_daddr, __be32 orig_saddr,
2418                                        int orig_oif, struct net_device *dev_out,
2419                                        unsigned int flags)
2420 {
2421         struct fib_info *fi = res->fi;
2422         u32 tos = RT_FL_TOS(fl4);
2423         struct in_device *in_dev;
2424         u16 type = res->type;
2425         struct rtable *rth;
2426
             /* A loopback source is only acceptable on a loopback device. */
2427         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2428                 return ERR_PTR(-EINVAL);
2429
             /* The destination address overrides the type from the lookup. */
2430         if (ipv4_is_lbcast(fl4->daddr))
2431                 type = RTN_BROADCAST;
2432         else if (ipv4_is_multicast(fl4->daddr))
2433                 type = RTN_MULTICAST;
2434         else if (ipv4_is_zeronet(fl4->daddr))
2435                 return ERR_PTR(-EINVAL);
2436
2437         if (dev_out->flags & IFF_LOOPBACK)
2438                 flags |= RTCF_LOCAL;
2439
2440         in_dev = __in_dev_get_rcu(dev_out);
2441         if (!in_dev)
2442                 return ERR_PTR(-EINVAL);
2443
             /* Clearing fi makes rt_set_nexthop() skip its gateway/metrics
              * setup, since that part is guarded by "if (fi)".
              */
2444         if (type == RTN_BROADCAST) {
2445                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2446                 fi = NULL;
2447         } else if (type == RTN_MULTICAST) {
2448                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL is dropped again when ip_check_mc_rcu()
                      * returns zero for this group/source/protocol.
                      */
2449                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2450                                      fl4->flowi4_proto))
2451                         flags &= ~RTCF_LOCAL;
2452                 /* If multicast route do not exist use
2453                  * default one, but do not gateway in this case.
2454                  * Yes, it is hack.
2455                  */
2456                 if (fi && res->prefixlen < 4)
2457                         fi = NULL;
2458         }
2459
2460         rth = rt_dst_alloc(dev_out,
2461                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2462                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2463         if (!rth)
2464                 return ERR_PTR(-ENOBUFS);
2465
2466         rth->dst.output = ip_output;
2467
             /* The key fields hold the caller's original addresses/oif;
              * rt_dst/rt_src hold the addresses from the resolved flow.
              */
2468         rth->rt_key_dst = orig_daddr;
2469         rth->rt_key_src = orig_saddr;
2470         rth->rt_genid = rt_genid(dev_net(dev_out));
2471         rth->rt_flags   = flags;
2472         rth->rt_type    = type;
2473         rth->rt_key_tos = tos;
2474         rth->rt_dst     = fl4->daddr;
2475         rth->rt_src     = fl4->saddr;
2476         rth->rt_route_iif = 0;
2477         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2478         rth->rt_oif     = orig_oif;
2479         rth->rt_mark    = fl4->flowi4_mark;
2480         rth->rt_gateway = fl4->daddr;
2481         rth->rt_spec_dst= fl4->saddr;
2482         rth->rt_peer_genid = 0;
2483         rth->peer = NULL;
2484         rth->fi = NULL;
2485
2486         RT_CACHE_STAT_INC(out_slow_tot);
2487
             /* rt_spec_dst: saddr by default (above), daddr for local
              * routes, and saddr again for broadcast/multicast (below).
              */
2488         if (flags & RTCF_LOCAL) {
2489                 rth->dst.input = ip_local_deliver;
2490                 rth->rt_spec_dst = fl4->daddr;
2491         }
2492         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2493                 rth->rt_spec_dst = fl4->saddr;
2494                 if (flags & RTCF_LOCAL &&
2495                     !(dev_out->flags & IFF_LOOPBACK)) {
2496                         rth->dst.output = ip_mc_output;
2497                         RT_CACHE_STAT_INC(out_slow_mc);
2498                 }
2499 #ifdef CONFIG_IP_MROUTE
2500                 if (type == RTN_MULTICAST) {
2501                         if (IN_DEV_MFORWARD(in_dev) &&
2502                             !ipv4_is_local_multicast(fl4->daddr)) {
2503                                 rth->dst.input = ip_mr_input;
2504                                 rth->dst.output = ip_mc_output;
2505                         }
2506                 }
2507 #endif
2508         }
2509
             /* fi may have been cleared above; itag is always 0 here. */
2510         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2511
2512         return rth;
2513 }
2514
2515 /*
2516  * Major route resolver routine.
2517  * called with rcu_read_lock();
2518  */
2519
2520 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2521 {
2522         struct net_device *dev_out = NULL;
2523         u32 tos = RT_FL_TOS(fl4);
2524         unsigned int flags = 0;
2525         struct fib_result res;
2526         struct rtable *rth;
2527         __be32 orig_daddr;
2528         __be32 orig_saddr;
2529         int orig_oif;
2530
2531         res.fi          = NULL;
2532 #ifdef CONFIG_IP_MULTIPLE_TABLES
2533         res.r           = NULL;
2534 #endif
2535
2536         orig_daddr = fl4->daddr;
2537         orig_saddr = fl4->saddr;
2538         orig_oif = fl4->flowi4_oif;
2539
2540         fl4->flowi4_iif = net->loopback_dev->ifindex;
2541         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2542         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2543                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2544
2545         rcu_read_lock();
2546         if (fl4->saddr) {
2547                 rth = ERR_PTR(-EINVAL);
2548                 if (ipv4_is_multicast(fl4->saddr) ||
2549                     ipv4_is_lbcast(fl4->saddr) ||
2550                     ipv4_is_zeronet(fl4->saddr))
2551                         goto out;
2552
2553                 /* I removed check for oif == dev_out->oif here.
2554                    It was wrong for two reasons:
2555                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2556                       is assigned to multiple interfaces.
2557                    2. Moreover, we are allowed to send packets with saddr
2558                       of another iface. --ANK
2559                  */
2560
2561                 if (fl4->flowi4_oif == 0 &&
2562                     (ipv4_is_multicast(fl4->daddr) ||
2563                      ipv4_is_lbcast(fl4->daddr))) {
2564                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2565                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2566                         if (dev_out == NULL)
2567                                 goto out;
2568
2569                         /* Special hack: user can direct multicasts
2570                            and limited broadcast via necessary interface
2571                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2572                            This hack is not just for fun, it allows
2573                            vic,vat and friends to work.
2574                            They bind socket to loopback, set ttl to zero
2575                            and expect that it will work.
2576                            From the viewpoint of routing cache they are broken,
2577                            because we are not allowed to build multicast path
2578                            with loopback source addr (look, routing cache
2579                            cannot know, that ttl is zero, so that packet
2580                            will not leave this host and route is valid).
2581                            Luckily, this hack is good workaround.
2582                          */
2583
2584                         fl4->flowi4_oif = dev_out->ifindex;
2585                         goto make_route;
2586                 }
2587
2588                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2589                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2590                         if (!__ip_dev_find(net, fl4->saddr, false))
2591                                 goto out;
2592                 }
2593         }
2594
2595
2596         if (fl4->flowi4_oif) {
2597                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2598                 rth = ERR_PTR(-ENODEV);
2599                 if (dev_out == NULL)
2600                         goto out;
2601
2602                 /* RACE: Check return value of inet_select_addr instead. */
2603                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2604                         rth = ERR_PTR(-ENETUNREACH);
2605                         goto out;
2606                 }
2607                 if (ipv4_is_local_multicast(fl4->daddr) ||
2608                     ipv4_is_lbcast(fl4->daddr)) {
2609                         if (!fl4->saddr)
2610                                 fl4->saddr = inet_select_addr(dev_out, 0,
2611                                                               RT_SCOPE_LINK);
2612                         goto make_route;
2613                 }
2614                 if (fl4->saddr) {
2615                         if (ipv4_is_multicast(fl4->daddr))
2616                                 fl4->saddr = inet_select_addr(dev_out, 0,
2617                                                               fl4->flowi4_scope);
2618                         else if (!fl4->daddr)
2619                                 fl4->saddr = inet_select_addr(dev_out, 0,
2620                                                               RT_SCOPE_HOST);
2621                 }
2622         }
2623
2624         if (!fl4->daddr) {
2625                 fl4->daddr = fl4->saddr;
2626                 if (!fl4->daddr)
2627                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2628                 dev_out = net->loopback_dev;
2629                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2630                 res.type = RTN_LOCAL;
2631                 flags |= RTCF_LOCAL;
2632                 goto make_route;
2633         }
2634
2635         if (fib_lookup(net, fl4, &res)) {
2636                 res.fi = NULL;
2637                 if (fl4->flowi4_oif) {
2638                         /* Apparently, routing tables are wrong. Assume,
2639                            that the destination is on link.
2640
2641                            WHY? DW.
2642                            Because we are allowed to send to iface
2643                            even if it has NO routes and NO assigned
2644                            addresses. When oif is specified, routing
2645                            tables are looked up with only one purpose:
2646                            to catch if destination is gatewayed, rather than
2647                            direct. Moreover, if MSG_DONTROUTE is set,
2648                            we send packet, ignoring both routing tables
2649                            and ifaddr state. --ANK
2650
2651
2652                            We could make it even if oif is unknown,
2653                            likely IPv6, but we do not.
2654                          */
2655
2656                         if (fl4->saddr == 0)
2657                                 fl4->saddr = inet_select_addr(dev_out, 0,
2658                                                               RT_SCOPE_LINK);
2659                         res.type = RTN_UNICAST;
2660                         goto make_route;
2661                 }
2662                 rth = ERR_PTR(-ENETUNREACH);
2663                 goto out;
2664         }
2665
2666         if (res.type == RTN_LOCAL) {
2667                 if (!fl4->saddr) {
2668                         if (res.fi->fib_prefsrc)
2669                                 fl4->saddr = res.fi->fib_prefsrc;
2670                         else
2671                                 fl4->saddr = fl4->daddr;
2672                 }
2673                 dev_out = net->loopback_dev;
2674                 fl4->flowi4_oif = dev_out->ifindex;
2675                 res.fi = NULL;
2676                 flags |= RTCF_LOCAL;
2677                 goto make_route;
2678         }
2679
2680 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2681         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2682                 fib_select_multipath(&res);
2683         else
2684 #endif
2685         if (!res.prefixlen &&
2686             res.table->tb_num_default > 1 &&
2687             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2688                 fib_select_default(&res);
2689
2690         if (!fl4->saddr)
2691                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2692
2693         dev_out = FIB_RES_DEV(res);
2694         fl4->flowi4_oif = dev_out->ifindex;
2695
2696
2697 make_route:
2698         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2699                                dev_out, flags);
2700         if (!IS_ERR(rth)) {
2701                 unsigned int hash;
2702
2703                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2704                                rt_genid(dev_net(dev_out)));
2705                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2706         }
2707
2708 out:
2709         rcu_read_unlock();
2710         return rth;
2711 }
2712
/*
 * Resolve an output route for @flp4 in namespace @net.
 *
 * When rt_caching(net) is true, the hash chain selected by rt_hash() over
 * (daddr, saddr, oif, genid) is walked under rcu_read_lock_bh().  An entry
 * matches when its key dst/src, oif and mark equal those of @flp4, the TOS
 * values agree in the IPTOS_RT_MASK | RTO_ONLINK bits, it is an output
 * route, its device belongs to @net and it is not expired.  On a hit the
 * entry is passed to dst_use(), zero saddr/daddr in @flp4 are filled in
 * from the entry, and the entry is returned.
 *
 * On a miss, or when caching is off, the result of ip_route_output_slow()
 * is returned (an rtable or an ERR_PTR).
 */
2713 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2714 {
2715         struct rtable *rth;
2716         unsigned int hash;
2717
2718         if (!rt_caching(net))
2719                 goto slow_output;
2720
2721         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2722
2723         rcu_read_lock_bh();
2724         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2725                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2726                 if (rth->rt_key_dst == flp4->daddr &&
2727                     rth->rt_key_src == flp4->saddr &&
2728                     rt_is_output_route(rth) &&
2729                     rth->rt_oif == flp4->flowi4_oif &&
2730                     rth->rt_mark == flp4->flowi4_mark &&
2731                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2732                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2733                     net_eq(dev_net(rth->dst.dev), net) &&
2734                     !rt_is_expired(rth)) {
                             /* NOTE(review): rth is still read after the
                              * unlock below; presumably dst_use() takes the
                              * reference that makes this safe - confirm.
                              */
2735                         dst_use(&rth->dst, jiffies);
2736                         RT_CACHE_STAT_INC(out_hit);
2737                         rcu_read_unlock_bh();
2738                         if (!flp4->saddr)
2739                                 flp4->saddr = rth->rt_src;
2740                         if (!flp4->daddr)
2741                                 flp4->daddr = rth->rt_dst;
2742                         return rth;
2743                 }
                     /* Counted once per non-matching chain entry. */
2744                 RT_CACHE_STAT_INC(out_hlist_search);
2745         }
2746         rcu_read_unlock_bh();
2747
2748 slow_output:
2749         return ip_route_output_slow(net, flp4);
2750 }
2751 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2752
/* .check hook of ipv4_dst_blackhole_ops: ignores both arguments and always returns NULL. */
2753 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2754 {
2755         return NULL;
2756 }
2757
/* .default_mtu hook of ipv4_dst_blackhole_ops: always reports 0, whatever @dst is. */
2758 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2759 {
2760         return 0;
2761 }
2762
/* .update_pmtu hook of ipv4_dst_blackhole_ops: intentionally empty, @mtu is discarded. */
2763 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2764 {
2765 }
2766
/* .cow_metrics hook of ipv4_dst_blackhole_ops: never provides a metrics array, always NULL. */
2767 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2768                                           unsigned long old)
2769 {
2770         return NULL;
2771 }
2772
/*
 * dst_ops used for the entries created by ipv4_blackhole_route().
 * check/default_mtu/update_pmtu/cow_metrics point at the stub
 * implementations above; destroy, default_advmss and neigh_lookup reuse
 * the regular IPv4 helpers defined elsewhere in this file.
 */
2773 static struct dst_ops ipv4_dst_blackhole_ops = {
2774         .family                 =       AF_INET,
2775         .protocol               =       cpu_to_be16(ETH_P_IP),
2776         .destroy                =       ipv4_dst_destroy,
2777         .check                  =       ipv4_blackhole_dst_check,
2778         .default_mtu            =       ipv4_blackhole_default_mtu,
2779         .default_advmss         =       ipv4_default_advmss,
2780         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2781         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2782         .neigh_lookup           =       ipv4_neigh_lookup,
2783 };
2784
/*
 * Build a "blackhole" copy of the route @dst_orig.
 *
 * A new rtable is allocated with ipv4_dst_blackhole_ops; its input and
 * output handlers are both dst_discard.  Metrics, device, key fields,
 * flags, type, addresses, gateway, peer and fib_info are copied from the
 * original, taking a reference on the device, peer and fib_info when they
 * are set.  rt_genid is taken from @net rather than copied.
 *
 * @dst_orig is always released, whether or not the allocation succeeded.
 *
 * Returns the new dst_entry, or ERR_PTR(-ENOMEM) if dst_alloc() failed.
 */
2785 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2786 {
2787         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2788         struct rtable *ort = (struct rtable *) dst_orig;
2789
2790         if (rt) {
2791                 struct dst_entry *new = &rt->dst;
2792
2793                 new->__use = 1;
2794                 new->input = dst_discard;
2795                 new->output = dst_discard;
2796                 dst_copy_metrics(new, &ort->dst);
2797
2798                 new->dev = ort->dst.dev;
2799                 if (new->dev)
2800                         dev_hold(new->dev);
2801
2802                 rt->rt_key_dst = ort->rt_key_dst;
2803                 rt->rt_key_src = ort->rt_key_src;
2804                 rt->rt_key_tos = ort->rt_key_tos;
2805                 rt->rt_route_iif = ort->rt_route_iif;
2806                 rt->rt_iif = ort->rt_iif;
2807                 rt->rt_oif = ort->rt_oif;
2808                 rt->rt_mark = ort->rt_mark;
2809
2810                 rt->rt_genid = rt_genid(net);
2811                 rt->rt_flags = ort->rt_flags;
2812                 rt->rt_type = ort->rt_type;
2813                 rt->rt_dst = ort->rt_dst;
2814                 rt->rt_src = ort->rt_src;
2815                 rt->rt_gateway = ort->rt_gateway;
2816                 rt->rt_spec_dst = ort->rt_spec_dst;
2817                 rt->peer = ort->peer;
2818                 if (rt->peer)
2819                         atomic_inc(&rt->peer->refcnt);
2820                 rt->fi = ort->fi;
2821                 if (rt->fi)
2822                         atomic_inc(&rt->fi->fib_clntref);
2823
                     /* NOTE(review): dst_free() is called on an entry that is
                      * then returned; presumably the initial refcount of 1
                      * passed to dst_alloc() defers the actual free - confirm.
                      */
2824                 dst_free(new);
2825         }
2826
2827         dst_release(dst_orig);
2828
2829         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2830 }
2831
/*
 * Output route lookup followed by an optional xfrm lookup.
 *
 * The route is first resolved with __ip_route_output_key(); an ERR_PTR is
 * returned unchanged.  When @flp4 carries a non-zero flowi4_proto the
 * route's dst is passed through xfrm_lookup() together with @sk, and
 * whatever that returns (cast back to struct rtable *) is the result.
 */
2832 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2833                                     struct sock *sk)
2834 {
2835         struct rtable *rt = __ip_route_output_key(net, flp4);
2836
2837         if (IS_ERR(rt))
2838                 return rt;
2839
2840         if (flp4->flowi4_proto)
2841                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2842                                                    flowi4_to_flowi(flp4),
2843                                                    sk, 0);
2844
2845         return rt;
2846 }
2847 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2848
/*
 * Append one netlink route message describing the rtable attached to @skb.
 *
 * @net:    namespace, used only for the MC_FORWARDING check and
 *          ipmr_get_route().
 * @skb:    message buffer; the route is taken from skb_rtable(skb).
 * @pid, @seq, @event, @flags: passed straight to nlmsg_put().
 * @nowait: forwarded to ipmr_get_route() and selects how its failure is
 *          handled (see below).
 *
 * Returns the value of nlmsg_end() on success and -EMSGSIZE when the header
 * or an attribute could not be added (the partial message is cancelled).
 * In the multicast-forwarding case with !nowait and ipmr_get_route()
 * returning 0, the function returns 0 without calling nlmsg_end().
 *
 * NOTE(review): the NLA_PUT* macros presumably jump to nla_put_failure when
 * the attribute does not fit - confirm against the netlink headers.
 */
2849 static int rt_fill_info(struct net *net,
2850                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2851                         int nowait, unsigned int flags)
2852 {
2853         struct rtable *rt = skb_rtable(skb);
2854         struct rtmsg *r;
2855         struct nlmsghdr *nlh;
2856         unsigned long expires = 0;
2857         const struct inet_peer *peer = rt->peer;
2858         u32 id = 0, ts = 0, tsage = 0, error;
2859
2860         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2861         if (nlh == NULL)
2862                 return -EMSGSIZE;
2863
             /* Fixed rtmsg header: always a /32 destination in the main table. */
2864         r = nlmsg_data(nlh);
2865         r->rtm_family    = AF_INET;
2866         r->rtm_dst_len  = 32;
2867         r->rtm_src_len  = 0;
2868         r->rtm_tos      = rt->rt_key_tos;
2869         r->rtm_table    = RT_TABLE_MAIN;
2870         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2871         r->rtm_type     = rt->rt_type;
2872         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2873         r->rtm_protocol = RTPROT_UNSPEC;
             /* Low 16 bits of rt_flags are masked out of rtm_flags. */
2874         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2875         if (rt->rt_flags & RTCF_NOTIFY)
2876                 r->rtm_flags |= RTM_F_NOTIFY;
2877
2878         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2879
2880         if (rt->rt_key_src) {
2881                 r->rtm_src_len = 32;
2882                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2883         }
2884         if (rt->dst.dev)
2885                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2886 #ifdef CONFIG_IP_ROUTE_CLASSID
2887         if (rt->dst.tclassid)
2888                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2889 #endif
             /* PREFSRC: rt_spec_dst for input routes; for output routes the
              * chosen source, but only when it differs from the key source.
              */
2890         if (rt_is_input_route(rt))
2891                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2892         else if (rt->rt_src != rt->rt_key_src)
2893                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2894
2895         if (rt->rt_dst != rt->rt_gateway)
2896                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2897
2898         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2899                 goto nla_put_failure;
2900
2901         if (rt->rt_mark)
2902                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2903
             /* Gather the values reported through rtnl_put_cacheinfo(). */
2904         error = rt->dst.error;
2905         if (peer) {
2906                 inet_peer_refcheck(rt->peer);
2907                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2908                 if (peer->tcp_ts_stamp) {
2909                         ts = peer->tcp_ts;
2910                         tsage = get_seconds() - peer->tcp_ts_stamp;
2911                 }
                     /* Report the remaining time until pmtu_expires, or 0
                      * once that moment has passed.
                      */
2912                 expires = ACCESS_ONCE(peer->pmtu_expires);
2913                 if (expires) {
2914                         if (time_before(jiffies, expires))
2915                                 expires -= jiffies;
2916                         else
2917                                 expires = 0;
2918                 }
2919         }
2920
2921         if (rt_is_input_route(rt)) {
2922 #ifdef CONFIG_IP_MROUTE
2923                 __be32 dst = rt->rt_dst;
2924
2925                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2926                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2927                         int err = ipmr_get_route(net, skb,
2928                                                  rt->rt_src, rt->rt_dst,
2929                                                  r, nowait);
                             /* err <= 0: without nowait, 0 ends the call with
                              * return 0 and anything else is a failure; with
                              * nowait only -EMSGSIZE is a failure, other
                              * values are reported via the cacheinfo error.
                              */
2930                         if (err <= 0) {
2931                                 if (!nowait) {
2932                                         if (err == 0)
2933                                                 return 0;
2934                                         goto nla_put_failure;
2935                                 } else {
2936                                         if (err == -EMSGSIZE)
2937                                                 goto nla_put_failure;
2938                                         error = err;
2939                                 }
2940                         }
2941                 } else
2942 #endif
                             /* Without CONFIG_IP_MROUTE this is unconditional
                              * for input routes; with it, this is the else
                              * branch of the multicast test above.
                              */
2943                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2944         }
2945
2946         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2947                                expires, error) < 0)
2948                 goto nla_put_failure;
2949
2950         return nlmsg_end(skb, nlh);
2951
2952 nla_put_failure:
2953         nlmsg_cancel(skb, nlh);
2954         return -EMSGSIZE;
2955 }
2956
/*
 * Netlink handler that resolves a single route and unicasts the answer
 * back to the requester.
 *
 * The request attributes RTA_SRC, RTA_DST, RTA_IIF, RTA_MARK and RTA_OIF
 * are read (each defaulting to 0 when absent).  With a non-zero RTA_IIF the
 * lookup is done as an input route on that device via ip_route_input();
 * otherwise an output lookup is done with ip_route_output_key().  The
 * resulting route is attached to a freshly allocated skb, formatted with
 * rt_fill_info() as RTM_NEWROUTE and sent with rtnl_unicast().
 *
 * Returns the rtnl_unicast() result on success, otherwise a negative errno
 * (parse error, -ENOBUFS, -ENODEV, lookup error) or the <= 0 value returned
 * by rt_fill_info(); the reply skb is freed on every failure after its
 * allocation.
 */
2957 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2958 {
2959         struct net *net = sock_net(in_skb->sk);
2960         struct rtmsg *rtm;
2961         struct nlattr *tb[RTA_MAX+1];
2962         struct rtable *rt = NULL;
2963         __be32 dst = 0;
2964         __be32 src = 0;
2965         u32 iif;
2966         int err;
2967         int mark;
2968         struct sk_buff *skb;
2969
2970         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2971         if (err < 0)
2972                 goto errout;
2973
2974         rtm = nlmsg_data(nlh);
2975
2976         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2977         if (skb == NULL) {
2978                 err = -ENOBUFS;
2979                 goto errout;
2980         }
2981
2982         /* Reserve room for dummy headers, this skb can pass
2983            through good chunk of routing engine.
2984          */
2985         skb_reset_mac_header(skb);
2986         skb_reset_network_header(skb);
2987
2988         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2989         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2990         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2991
2992         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2993         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2994         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2995         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2996
2997         if (iif) {
                     /* Input-route query: pretend the dummy skb arrived on iif. */
2998                 struct net_device *dev;
2999
3000                 dev = __dev_get_by_index(net, iif);
3001                 if (dev == NULL) {
3002                         err = -ENODEV;
3003                         goto errout_free;
3004                 }
3005
3006                 skb->protocol   = htons(ETH_P_IP);
3007                 skb->dev        = dev;
3008                 skb->mark       = mark;
3009                 local_bh_disable();
3010                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3011                 local_bh_enable();
3012
                     /* A successful lookup may still yield a route that
                      * carries an error; report it as a negative errno.
                      */
3013                 rt = skb_rtable(skb);
3014                 if (err == 0 && rt->dst.error)
3015                         err = -rt->dst.error;
3016         } else {
                     /* Output-route query built from the request attributes. */
3017                 struct flowi4 fl4 = {
3018                         .daddr = dst,
3019                         .saddr = src,
3020                         .flowi4_tos = rtm->rtm_tos,
3021                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3022                         .flowi4_mark = mark,
3023                 };
3024                 rt = ip_route_output_key(net, &fl4);
3025
3026                 err = 0;
3027                 if (IS_ERR(rt))
3028                         err = PTR_ERR(rt);
3029         }
3030
3031         if (err)
3032                 goto errout_free;
3033
3034         skb_dst_set(skb, &rt->dst);
3035         if (rtm->rtm_flags & RTM_F_NOTIFY)
3036                 rt->rt_flags |= RTCF_NOTIFY;
3037
3038         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3039                            RTM_NEWROUTE, 0, 0);
3040         if (err <= 0)
3041                 goto errout_free;
3042
3043         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3044 errout:
3045         return err;
3046
3047 errout_free:
3048         kfree_skb(skb);
3049         goto errout;
3050 }
3051
/*
 * Netlink dump callback for the route cache.
 *
 * cb->args[0] and cb->args[1] hold the hash bucket and the index inside
 * that bucket at which to resume; a negative bucket is treated as 0.  Each
 * bucket is walked under rcu_read_lock_bh(); entries belonging to another
 * namespace, entries before the resume index and expired entries are
 * skipped.  Every remaining entry is temporarily attached to @skb (without
 * taking a reference) and formatted by rt_fill_info() as an NLM_F_MULTI
 * RTM_NEWROUTE message.  The walk stops as soon as rt_fill_info() returns
 * a value <= 0, and the current position is stored back into cb->args.
 *
 * Returns skb->len.
 */
3052 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3053 {
3054         struct rtable *rt;
3055         int h, s_h;
3056         int idx, s_idx;
3057         struct net *net;
3058
3059         net = sock_net(skb->sk);
3060
3061         s_h = cb->args[0];
3062         if (s_h < 0)
3063                 s_h = 0;
3064         s_idx = idx = cb->args[1];
             /* s_idx only applies to the first bucket; it is reset to 0 on
              * every later iteration.
              */
3065         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3066                 if (!rt_hash_table[h].chain)
3067                         continue;
3068                 rcu_read_lock_bh();
3069                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3070                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3071                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3072                                 continue;
3073                         if (rt_is_expired(rt))
3074                                 continue;
3075                         skb_dst_set_noref(skb, &rt->dst);
3076                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3077                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3078                                          1, NLM_F_MULTI) <= 0) {
3079                                 skb_dst_drop(skb);
3080                                 rcu_read_unlock_bh();
3081                                 goto done;
3082                         }
3083                         skb_dst_drop(skb);
3084                 }
3085                 rcu_read_unlock_bh();
3086         }
3087
3088 done:
             /* Save the position for the next invocation of the dump. */
3089         cb->args[0] = h;
3090         cb->args[1] = idx;
3091         return skb->len;
3092 }
3093
/* Flush the route cache of the namespace owning @in_dev's device, with a delay argument of 0. */
3094 void ip_rt_multicast_event(struct in_device *in_dev)
3095 {
3096         rt_cache_flush(dev_net(in_dev->dev), 0);
3097 }
3098
3099 #ifdef CONFIG_SYSCTL
/*
 * Handler for the write-only "flush" sysctl.
 *
 * On write, the user buffer is parsed as one integer with proc_dointvec()
 * through a private copy of the ctl_table whose data pointer is redirected
 * to a local variable, and the route cache of the namespace stored in
 * __ctl->extra1 is flushed with that value as the delay.  Returns 0.
 * Reads are rejected with -EINVAL.
 */
3100 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3101                                         void __user *buffer,
3102                                         size_t *lenp, loff_t *ppos)
3103 {
3104         if (write) {
                     /* The proc_dointvec() result is deliberately not
                      * checked (any write flushes), and it can return
                      * without storing a value.  Start from a defined delay
                      * of 0 rather than reading an uninitialized stack slot.
                      */
3105                 int flush_delay = 0;
3106                 ctl_table ctl;
3107                 struct net *net;
3108
3109                 memcpy(&ctl, __ctl, sizeof(ctl));
3110                 ctl.data = &flush_delay;
3111                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3112
3113                 net = (struct net *)__ctl->extra1;
3114                 rt_cache_flush(net, flush_delay);
3115                 return 0;
3116         }
3117
3118         return -EINVAL;
3119 }
3120
/*
 * Tunables of the IPv4 routing code, attached as the "route" child of
 * ipv4_skeleton.  Every entry is an int with mode 0644 bound directly to a
 * global variable; the handler is proc_dointvec except for the entries
 * using the *_jiffies variants.
 * NOTE(review): the jiffies handlers presumably convert between a
 * user-visible time unit and jiffies - confirm in kernel/sysctl.c.
 */
3121 static ctl_table ipv4_route_table[] = {
3122         {
3123                 .procname       = "gc_thresh",
3124                 .data           = &ipv4_dst_ops.gc_thresh,
3125                 .maxlen         = sizeof(int),
3126                 .mode           = 0644,
3127                 .proc_handler   = proc_dointvec,
3128         },
3129         {
3130                 .procname       = "max_size",
3131                 .data           = &ip_rt_max_size,
3132                 .maxlen         = sizeof(int),
3133                 .mode           = 0644,
3134                 .proc_handler   = proc_dointvec,
3135         },
3136         {
3137                 /*  Deprecated. Use gc_min_interval_ms */
3138
3139                 .procname       = "gc_min_interval",
3140                 .data           = &ip_rt_gc_min_interval,
3141                 .maxlen         = sizeof(int),
3142                 .mode           = 0644,
3143                 .proc_handler   = proc_dointvec_jiffies,
3144         },
3145         {
                     /* Same variable as "gc_min_interval", different handler. */
3146                 .procname       = "gc_min_interval_ms",
3147                 .data           = &ip_rt_gc_min_interval,
3148                 .maxlen         = sizeof(int),
3149                 .mode           = 0644,
3150                 .proc_handler   = proc_dointvec_ms_jiffies,
3151         },
3152         {
3153                 .procname       = "gc_timeout",
3154                 .data           = &ip_rt_gc_timeout,
3155                 .maxlen         = sizeof(int),
3156                 .mode           = 0644,
3157                 .proc_handler   = proc_dointvec_jiffies,
3158         },
3159         {
3160                 .procname       = "redirect_load",
3161                 .data           = &ip_rt_redirect_load,
3162                 .maxlen         = sizeof(int),
3163                 .mode           = 0644,
3164                 .proc_handler   = proc_dointvec,
3165         },
3166         {
3167                 .procname       = "redirect_number",
3168                 .data           = &ip_rt_redirect_number,
3169                 .maxlen         = sizeof(int),
3170                 .mode           = 0644,
3171                 .proc_handler   = proc_dointvec,
3172         },
3173         {
3174                 .procname       = "redirect_silence",
3175                 .data           = &ip_rt_redirect_silence,
3176                 .maxlen         = sizeof(int),
3177                 .mode           = 0644,
3178                 .proc_handler   = proc_dointvec,
3179         },
3180         {
3181                 .procname       = "error_cost",
3182                 .data           = &ip_rt_error_cost,
3183                 .maxlen         = sizeof(int),
3184                 .mode           = 0644,
3185                 .proc_handler   = proc_dointvec,
3186         },
3187         {
3188                 .procname       = "error_burst",
3189                 .data           = &ip_rt_error_burst,
3190                 .maxlen         = sizeof(int),
3191                 .mode           = 0644,
3192                 .proc_handler   = proc_dointvec,
3193         },
3194         {
3195                 .procname       = "gc_elasticity",
3196                 .data           = &ip_rt_gc_elasticity,
3197                 .maxlen         = sizeof(int),
3198                 .mode           = 0644,
3199                 .proc_handler   = proc_dointvec,
3200         },
3201         {
3202                 .procname       = "mtu_expires",
3203                 .data           = &ip_rt_mtu_expires,
3204                 .maxlen         = sizeof(int),
3205                 .mode           = 0644,
3206                 .proc_handler   = proc_dointvec_jiffies,
3207         },
3208         {
3209                 .procname       = "min_pmtu",
3210                 .data           = &ip_rt_min_pmtu,
3211                 .maxlen         = sizeof(int),
3212                 .mode           = 0644,
3213                 .proc_handler   = proc_dointvec,
3214         },
3215         {
3216                 .procname       = "min_adv_mss",
3217                 .data           = &ip_rt_min_advmss,
3218                 .maxlen         = sizeof(int),
3219                 .mode           = 0644,
3220                 .proc_handler   = proc_dointvec,
3221         },
             /* Empty terminating entry. */
3222         { }
3223 };
3224
/* One zeroed entry, used as the (empty) child table of "neigh" below. */
3225 static struct ctl_table empty[1];
3226
/*
 * Directory skeleton with two read/search-only (0555) entries: "route",
 * whose children are ipv4_route_table, and "neigh", which starts out with
 * the empty child table above.
 */
3227 static struct ctl_table ipv4_skeleton[] =
3228 {
3229         { .procname = "route",
3230           .mode = 0555, .child = ipv4_route_table},
3231         { .procname = "neigh",
3232           .mode = 0555, .child = empty},
3233         { }
3234 };
3235
/* sysctl path "net" / "ipv4", terminated by an empty entry. */
3236 static __net_initdata struct ctl_path ipv4_path[] = {
3237         { .procname = "net", },
3238         { .procname = "ipv4", },
3239         { },
3240 };
3241
/*
 * Template for the per-namespace "flush" entry: write-only (0200), handled
 * by ipv4_sysctl_rtcache_flush().  No .data is set here; extra1 is filled
 * in with the owning namespace by sysctl_route_net_init().
 */
3242 static struct ctl_table ipv4_route_flush_table[] = {
3243         {
3244                 .procname       = "flush",
3245                 .maxlen         = sizeof(int),
3246                 .mode           = 0200,
3247                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3248         },
3249         { },
3250 };
3251
/* sysctl path "net" / "ipv4" / "route" under which the flush table is registered. */
3252 static __net_initdata struct ctl_path ipv4_route_path[] = {
3253         { .procname = "net", },
3254         { .procname = "ipv4", },
3255         { .procname = "route", },
3256         { },
3257 };
3258
/*
 * Per-namespace init: register the "flush" sysctl for @net.
 *
 * init_net uses the static ipv4_route_flush_table directly; every other
 * namespace gets its own kmemdup() copy.  The first entry's extra1 is set
 * to @net so that the handler can find the namespace to flush.
 *
 * Returns 0 on success, -ENOMEM when the copy or the registration fails
 * (a copied table is freed again in the latter case).
 */
3259 static __net_init int sysctl_route_net_init(struct net *net)
3260 {
3261         struct ctl_table *tbl;
3262
3263         tbl = ipv4_route_flush_table;
3264         if (!net_eq(net, &init_net)) {
3265                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3266                 if (tbl == NULL)
3267                         goto err_dup;
3268         }
3269         tbl[0].extra1 = net;
3270
3271         net->ipv4.route_hdr =
3272                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3273         if (net->ipv4.route_hdr == NULL)
3274                 goto err_reg;
3275         return 0;
3276
3277 err_reg:
             /* Only a duplicated table may be freed, never the static one. */
3278         if (tbl != ipv4_route_flush_table)
3279                 kfree(tbl);
3280 err_dup:
3281         return -ENOMEM;
3282 }
3283
3284 static __net_exit void sysctl_route_net_exit(struct net *net)
3285 {
3286         struct ctl_table *tbl;
3287
3288         tbl = net->ipv4.route_hdr->ctl_table_arg;
3289         unregister_net_sysctl_table(net->ipv4.route_hdr);
3290         BUG_ON(tbl == ipv4_route_flush_table);
3291         kfree(tbl);
3292 }
3293
3294 static __net_initdata struct pernet_operations sysctl_route_ops = {
3295         .init = sysctl_route_net_init,
3296         .exit = sysctl_route_net_exit,
3297 };
3298 #endif
3299
3300 static __net_init int rt_genid_init(struct net *net)
3301 {
3302         get_random_bytes(&net->ipv4.rt_genid,
3303                          sizeof(net->ipv4.rt_genid));
3304         get_random_bytes(&net->ipv4.dev_addr_genid,
3305                          sizeof(net->ipv4.dev_addr_genid));
3306         return 0;
3307 }
3308
3309 static __net_initdata struct pernet_operations rt_genid_ops = {
3310         .init = rt_genid_init,
3311 };
3312
3313
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting array; allocated with 256 entries in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3317
3318 static __initdata unsigned long rhash_entries;
3319 static int __init set_rhash_entries(char *str)
3320 {
3321         if (!str)
3322                 return 0;
3323         rhash_entries = simple_strtoul(str, &str, 0);
3324         return 1;
3325 }
3326 __setup("rhash_entries=", set_rhash_entries);
3327
3328 int __init ip_rt_init(void)
3329 {
3330         int rc = 0;
3331
3332 #ifdef CONFIG_IP_ROUTE_CLASSID
3333         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3334         if (!ip_rt_acct)
3335                 panic("IP: failed to allocate ip_rt_acct\n");
3336 #endif
3337
3338         ipv4_dst_ops.kmem_cachep =
3339                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3340                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3341
3342         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3343
3344         if (dst_entries_init(&ipv4_dst_ops) < 0)
3345                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3346
3347         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3348                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3349
3350         rt_hash_table = (struct rt_hash_bucket *)
3351                 alloc_large_system_hash("IP route cache",
3352                                         sizeof(struct rt_hash_bucket),
3353                                         rhash_entries,
3354                                         (totalram_pages >= 128 * 1024) ?
3355                                         15 : 17,
3356                                         0,
3357                                         &rt_hash_log,
3358                                         &rt_hash_mask,
3359                                         rhash_entries ? 0 : 512 * 1024);
3360         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3361         rt_hash_lock_init();
3362
3363         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3364         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3365
3366         devinet_init();
3367         ip_fib_init();
3368
3369         if (ip_rt_proc_init())
3370                 printk(KERN_ERR "Unable to create route proc files\n");
3371 #ifdef CONFIG_XFRM
3372         xfrm_init();
3373         xfrm4_init(ip_rt_max_size);
3374 #endif
3375         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3376
3377 #ifdef CONFIG_SYSCTL
3378         register_pernet_subsys(&sysctl_route_ops);
3379 #endif
3380         register_pernet_subsys(&rt_genid_ops);
3381         return rc;
3382 }
3383
3384 #ifdef CONFIG_SYSCTL
3385 /*
3386  * We really need to sanitize the damn ipv4 init order, then all
3387  * this nonsense will go away.
3388  */
3389 void __init ip_static_sysctl_init(void)
3390 {
3391         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3392 }
3393 #endif