net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static struct delayed_work expires_work;
 135 static unsigned long expires_ljiffies;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                          struct net_device *dev, int how);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149
 150
 151 static struct dst_ops ipv4_dst_ops = {
 152         .family =               AF_INET,
 153         .protocol =             cpu_to_be16(ETH_P_IP),
 154         .gc =                   rt_garbage_collect,
 155         .check =                ipv4_dst_check,
 156         .destroy =              ipv4_dst_destroy,
 157         .ifdown =               ipv4_dst_ifdown,
 158         .negative_advice =      ipv4_negative_advice,
 159         .link_failure =         ipv4_link_failure,
 160         .update_pmtu =          ip_rt_update_pmtu,
 161         .local_out =            __ip_local_out,
 162         .entries =              ATOMIC_INIT(0),
 163 };
 164
 165 #define ECN_OR_COST(class)      TC_PRIO_##class
 166
 167 const __u8 ip_tos2prio[16] = {
 168         TC_PRIO_BESTEFFORT,
 169         ECN_OR_COST(FILLER),
 170         TC_PRIO_BESTEFFORT,
 171         ECN_OR_COST(BESTEFFORT),
 172         TC_PRIO_BULK,
 173         ECN_OR_COST(BULK),
 174         TC_PRIO_BULK,
 175         ECN_OR_COST(BULK),
 176         TC_PRIO_INTERACTIVE,
 177         ECN_OR_COST(INTERACTIVE),
 178         TC_PRIO_INTERACTIVE,
 179         ECN_OR_COST(INTERACTIVE),
 180         TC_PRIO_INTERACTIVE_BULK,
 181         ECN_OR_COST(INTERACTIVE_BULK),
 182         TC_PRIO_INTERACTIVE_BULK,
 183         ECN_OR_COST(INTERACTIVE_BULK)
 184 };
 185
 186
 187 /*
 188  * Route cache.
 189  */
 190
 191 /* The locking scheme is rather straight forward:
 192  *
 193  * 1) Read-Copy Update protects the buckets of the central route hash.
 194  * 2) Only writers remove entries, and they hold the lock
 195  *    as they look at rtable reference counts.
 196  * 3) Only readers acquire references to rtable entries,
 197  *    they do so with atomic increments and with the
 198  *    lock held.
 199  */
 200
 201 struct rt_hash_bucket {
 202         struct rtable   *chain;
 203 };
 204
 205 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 206         defined(CONFIG_PROVE_LOCKING)
 207 /*
 208  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 209  * The size of this table is a power of two and depends on the number of CPUS.
 210  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 211  */
 212 #ifdef CONFIG_LOCKDEP
 213 # define RT_HASH_LOCK_SZ        256
 214 #else
 215 # if NR_CPUS >= 32
 216 #  define RT_HASH_LOCK_SZ       4096
 217 # elif NR_CPUS >= 16
 218 #  define RT_HASH_LOCK_SZ       2048
 219 # elif NR_CPUS >= 8
 220 #  define RT_HASH_LOCK_SZ       1024
 221 # elif NR_CPUS >= 4
 222 #  define RT_HASH_LOCK_SZ       512
 223 # else
 224 #  define RT_HASH_LOCK_SZ       256
 225 # endif
 226 #endif
 227
 228 static spinlock_t       *rt_hash_locks;
 229 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 230
 231 static __init void rt_hash_lock_init(void)
 232 {
 233         int i;
 234
 235         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 236                         GFP_KERNEL);
 237         if (!rt_hash_locks)
 238                 panic("IP: failed to allocate rt_hash_locks\n");
 239
 240         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 241                 spin_lock_init(&rt_hash_locks[i]);
 242 }
 243 #else
 244 # define rt_hash_lock_addr(slot) NULL
 245
 246 static inline void rt_hash_lock_init(void)
 247 {
 248 }
 249 #endif
 250
 251 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 252 static unsigned                 rt_hash_mask __read_mostly;
 253 static unsigned int             rt_hash_log  __read_mostly;
 254
 255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 256 #define RT_CACHE_STAT_INC(field) \
 257         (__raw_get_cpu_var(rt_cache_stat).field++)
 258
 259 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 260                 int genid)
 261 {
 262         return jhash_3words((__force u32)(__be32)(daddr),
 263                             (__force u32)(__be32)(saddr),
 264                             idx, genid)
 265                 & rt_hash_mask;
 266 }
 267
 268 static inline int rt_genid(struct net *net)
 269 {
 270         return atomic_read(&net->ipv4.rt_genid);
 271 }
 272
 273 #ifdef CONFIG_PROC_FS
 274 struct rt_cache_iter_state {
 275         struct seq_net_private p;
 276         int bucket;
 277         int genid;
 278 };
 279
 280 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 281 {
 282         struct rt_cache_iter_state *st = seq->private;
 283         struct rtable *r = NULL;
 284
 285         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 286                 if (!rt_hash_table[st->bucket].chain)
 287                         continue;
 288                 rcu_read_lock_bh();
 289                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 290                 while (r) {
 291                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 292                             r->rt_genid == st->genid)
 293                                 return r;
 294                         r = rcu_dereference_bh(r->u.dst.rt_next);
 295                 }
 296                 rcu_read_unlock_bh();
 297         }
 298         return r;
 299 }
 300
 301 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 302                                           struct rtable *r)
 303 {
 304         struct rt_cache_iter_state *st = seq->private;
 305
 306         r = r->u.dst.rt_next;
 307         while (!r) {
 308                 rcu_read_unlock_bh();
 309                 do {
 310                         if (--st->bucket < 0)
 311                                 return NULL;
 312                 } while (!rt_hash_table[st->bucket].chain);
 313                 rcu_read_lock_bh();
 314                 r = rt_hash_table[st->bucket].chain;
 315         }
 316         return rcu_dereference_bh(r);
 317 }
 318
 319 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 320                                         struct rtable *r)
 321 {
 322         struct rt_cache_iter_state *st = seq->private;
 323         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 324                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 325                         continue;
 326                 if (r->rt_genid == st->genid)
 327                         break;
 328         }
 329         return r;
 330 }
 331
 332 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 333 {
 334         struct rtable *r = rt_cache_get_first(seq);
 335
 336         if (r)
 337                 while (pos && (r = rt_cache_get_next(seq, r)))
 338                         --pos;
 339         return pos ? NULL : r;
 340 }
 341
 342 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 343 {
 344         struct rt_cache_iter_state *st = seq->private;
 345         if (*pos)
 346                 return rt_cache_get_idx(seq, *pos - 1);
 347         st->genid = rt_genid(seq_file_net(seq));
 348         return SEQ_START_TOKEN;
 349 }
 350
 351 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 352 {
 353         struct rtable *r;
 354
 355         if (v == SEQ_START_TOKEN)
 356                 r = rt_cache_get_first(seq);
 357         else
 358                 r = rt_cache_get_next(seq, v);
 359         ++*pos;
 360         return r;
 361 }
 362
 363 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 364 {
 365         if (v && v != SEQ_START_TOKEN)
 366                 rcu_read_unlock_bh();
 367 }
 368
 369 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 370 {
 371         if (v == SEQ_START_TOKEN)
 372                 seq_printf(seq, "%-127s\n",
 373                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 374                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 375                            "HHUptod\tSpecDst");
 376         else {
 377                 struct rtable *r = v;
 378                 int len;
 379
 380                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 381                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 382                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 383                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 384                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 385                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 386                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 387                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 388                         dst_metric(&r->u.dst, RTAX_WINDOW),
 389                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 390                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 391                         r->fl.fl4_tos,
 392                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 393                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 394                                        dev_queue_xmit) : 0,
 395                         r->rt_spec_dst, &len);
 396
 397                 seq_printf(seq, "%*s\n", 127 - len, "");
 398         }
 399         return 0;
 400 }
 401
 402 static const struct seq_operations rt_cache_seq_ops = {
 403         .start  = rt_cache_seq_start,
 404         .next   = rt_cache_seq_next,
 405         .stop   = rt_cache_seq_stop,
 406         .show   = rt_cache_seq_show,
 407 };
 408
 409 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 410 {
 411         return seq_open_net(inode, file, &rt_cache_seq_ops,
 412                         sizeof(struct rt_cache_iter_state));
 413 }
 414
 415 static const struct file_operations rt_cache_seq_fops = {
 416         .owner   = THIS_MODULE,
 417         .open    = rt_cache_seq_open,
 418         .read    = seq_read,
 419         .llseek  = seq_lseek,
 420         .release = seq_release_net,
 421 };
 422
 423
 424 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 425 {
 426         int cpu;
 427
 428         if (*pos == 0)
 429                 return SEQ_START_TOKEN;
 430
 431         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 432                 if (!cpu_possible(cpu))
 433                         continue;
 434                 *pos = cpu+1;
 435                 return &per_cpu(rt_cache_stat, cpu);
 436         }
 437         return NULL;
 438 }
 439
 440 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 441 {
 442         int cpu;
 443
 444         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 445                 if (!cpu_possible(cpu))
 446                         continue;
 447                 *pos = cpu+1;
 448                 return &per_cpu(rt_cache_stat, cpu);
 449         }
 450         return NULL;
 451
 452 }
 453
 454 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 455 {
 456
 457 }
 458
 459 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 460 {
 461         struct rt_cache_stat *st = v;
 462
 463         if (v == SEQ_START_TOKEN) {
 464                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 465                 return 0;
 466         }
 467
 468         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 469                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 470                    atomic_read(&ipv4_dst_ops.entries),
 471                    st->in_hit,
 472                    st->in_slow_tot,
 473                    st->in_slow_mc,
 474                    st->in_no_route,
 475                    st->in_brd,
 476                    st->in_martian_dst,
 477                    st->in_martian_src,
 478
 479                    st->out_hit,
 480                    st->out_slow_tot,
 481                    st->out_slow_mc,
 482
 483                    st->gc_total,
 484                    st->gc_ignored,
 485                    st->gc_goal_miss,
 486                    st->gc_dst_overflow,
 487                    st->in_hlist_search,
 488                    st->out_hlist_search
 489                 );
 490         return 0;
 491 }
 492
 493 static const struct seq_operations rt_cpu_seq_ops = {
 494         .start  = rt_cpu_seq_start,
 495         .next   = rt_cpu_seq_next,
 496         .stop   = rt_cpu_seq_stop,
 497         .show   = rt_cpu_seq_show,
 498 };
 499
 500
 501 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 502 {
 503         return seq_open(file, &rt_cpu_seq_ops);
 504 }
 505
 506 static const struct file_operations rt_cpu_seq_fops = {
 507         .owner   = THIS_MODULE,
 508         .open    = rt_cpu_seq_open,
 509         .read    = seq_read,
 510         .llseek  = seq_lseek,
 511         .release = seq_release,
 512 };
 513
 514 #ifdef CONFIG_NET_CLS_ROUTE
 515 static int rt_acct_proc_show(struct seq_file *m, void *v)
 516 {
 517         struct ip_rt_acct *dst, *src;
 518         unsigned int i, j;
 519
 520         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 521         if (!dst)
 522                 return -ENOMEM;
 523
 524         for_each_possible_cpu(i) {
 525                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 526                 for (j = 0; j < 256; j++) {
 527                         dst[j].o_bytes   += src[j].o_bytes;
 528                         dst[j].o_packets += src[j].o_packets;
 529                         dst[j].i_bytes   += src[j].i_bytes;
 530                         dst[j].i_packets += src[j].i_packets;
 531                 }
 532         }
 533
 534         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 535         kfree(dst);
 536         return 0;
 537 }
 538
 539 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 540 {
 541         return single_open(file, rt_acct_proc_show, NULL);
 542 }
 543
 544 static const struct file_operations rt_acct_proc_fops = {
 545         .owner          = THIS_MODULE,
 546         .open           = rt_acct_proc_open,
 547         .read           = seq_read,
 548         .llseek         = seq_lseek,
 549         .release        = single_release,
 550 };
 551 #endif
 552
 553 static int __net_init ip_rt_do_proc_init(struct net *net)
 554 {
 555         struct proc_dir_entry *pde;
 556
 557         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 558                         &rt_cache_seq_fops);
 559         if (!pde)
 560                 goto err1;
 561
 562         pde = proc_create("rt_cache", S_IRUGO,
 563                           net->proc_net_stat, &rt_cpu_seq_fops);
 564         if (!pde)
 565                 goto err2;
 566
 567 #ifdef CONFIG_NET_CLS_ROUTE
 568         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 569         if (!pde)
 570                 goto err3;
 571 #endif
 572         return 0;
 573
 574 #ifdef CONFIG_NET_CLS_ROUTE
 575 err3:
 576         remove_proc_entry("rt_cache", net->proc_net_stat);
 577 #endif
 578 err2:
 579         remove_proc_entry("rt_cache", net->proc_net);
 580 err1:
 581         return -ENOMEM;
 582 }
 583
 584 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 585 {
 586         remove_proc_entry("rt_cache", net->proc_net_stat);
 587         remove_proc_entry("rt_cache", net->proc_net);
 588 #ifdef CONFIG_NET_CLS_ROUTE
 589         remove_proc_entry("rt_acct", net->proc_net);
 590 #endif
 591 }
 592
 593 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 594         .init = ip_rt_do_proc_init,
 595         .exit = ip_rt_do_proc_exit,
 596 };
 597
 598 static int __init ip_rt_proc_init(void)
 599 {
 600         return register_pernet_subsys(&ip_rt_proc_ops);
 601 }
 602
 603 #else
 604 static inline int ip_rt_proc_init(void)
 605 {
 606         return 0;
 607 }
 608 #endif /* CONFIG_PROC_FS */
 609
 610 static inline void rt_free(struct rtable *rt)
 611 {
 612         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 613 }
 614
 615 static inline void rt_drop(struct rtable *rt)
 616 {
 617         ip_rt_put(rt);
 618         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 619 }
 620
 621 static inline int rt_fast_clean(struct rtable *rth)
 622 {
 623         /* Kill broadcast/multicast entries very aggresively, if they
 624            collide in hash table with more useful entries */
 625         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 626                 rth->fl.iif && rth->u.dst.rt_next;
 627 }
 628
 629 static inline int rt_valuable(struct rtable *rth)
 630 {
 631         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 632                 rth->u.dst.expires;
 633 }
 634
 635 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 636 {
 637         unsigned long age;
 638         int ret = 0;
 639
 640         if (atomic_read(&rth->u.dst.__refcnt))
 641                 goto out;
 642
 643         ret = 1;
 644         if (rth->u.dst.expires &&
 645             time_after_eq(jiffies, rth->u.dst.expires))
 646                 goto out;
 647
 648         age = jiffies - rth->u.dst.lastuse;
 649         ret = 0;
 650         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 651             (age <= tmo2 && rt_valuable(rth)))
 652                 goto out;
 653         ret = 1;
 654 out:    return ret;
 655 }
 656
 657 /* Bits of score are:
 658  * 31: very valuable
 659  * 30: not quite useless
 660  * 29..0: usage counter
 661  */
 662 static inline u32 rt_score(struct rtable *rt)
 663 {
 664         u32 score = jiffies - rt->u.dst.lastuse;
 665
 666         score = ~score & ~(3<<30);
 667
 668         if (rt_valuable(rt))
 669                 score |= (1<<31);
 670
 671         if (!rt->fl.iif ||
 672             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 673                 score |= (1<<30);
 674
 675         return score;
 676 }
 677
 678 static inline bool rt_caching(const struct net *net)
 679 {
 680         return net->ipv4.current_rt_cache_rebuild_count <=
 681                 net->ipv4.sysctl_rt_cache_rebuild_count;
 682 }
 683
 684 static inline bool compare_hash_inputs(const struct flowi *fl1,
 685                                         const struct flowi *fl2)
 686 {
 687         return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 688                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 689                 (fl1->iif ^ fl2->iif)) == 0);
 690 }
 691
 692 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 693 {
 694         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 695                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 696                 (fl1->mark ^ fl2->mark) |
 697                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 698                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 699                 (fl1->oif ^ fl2->oif) |
 700                 (fl1->iif ^ fl2->iif)) == 0;
 701 }
 702
 703 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 704 {
 705         return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
 706 }
 707
 708 static inline int rt_is_expired(struct rtable *rth)
 709 {
 710         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 711 }
 712
 713 /*
 714  * Perform a full scan of hash table and free all entries.
 715  * Can be called by a softirq or a process.
 716  * In the later case, we want to be reschedule if necessary
 717  */
 718 static void rt_do_flush(int process_context)
 719 {
 720         unsigned int i;
 721         struct rtable *rth, *next;
 722         struct rtable * tail;
 723
 724         for (i = 0; i <= rt_hash_mask; i++) {
 725                 if (process_context && need_resched())
 726                         cond_resched();
 727                 rth = rt_hash_table[i].chain;
 728                 if (!rth)
 729                         continue;
 730
 731                 spin_lock_bh(rt_hash_lock_addr(i));
 732 #ifdef CONFIG_NET_NS
 733                 {
 734                 struct rtable ** prev, * p;
 735
 736                 rth = rt_hash_table[i].chain;
 737
 738                 /* defer releasing the head of the list after spin_unlock */
 739                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
 740                         if (!rt_is_expired(tail))
 741                                 break;
 742                 if (rth != tail)
 743                         rt_hash_table[i].chain = tail;
 744
 745                 /* call rt_free on entries after the tail requiring flush */
 746                 prev = &rt_hash_table[i].chain;
 747                 for (p = *prev; p; p = next) {
 748                         next = p->u.dst.rt_next;
 749                         if (!rt_is_expired(p)) {
 750                                 prev = &p->u.dst.rt_next;
 751                         } else {
 752                                 *prev = next;
 753                                 rt_free(p);
 754                         }
 755                 }
 756                 }
 757 #else
 758                 rth = rt_hash_table[i].chain;
 759                 rt_hash_table[i].chain = NULL;
 760                 tail = NULL;
 761 #endif
 762                 spin_unlock_bh(rt_hash_lock_addr(i));
 763
 764                 for (; rth != tail; rth = next) {
 765                         next = rth->u.dst.rt_next;
 766                         rt_free(rth);
 767                 }
 768         }
 769 }
 770
 771 /*
 772  * While freeing expired entries, we compute average chain length
 773  * and standard deviation, using fixed-point arithmetic.
 774  * This to have an estimation of rt_chain_length_max
 775  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 776  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 777  */
 778
 779 #define FRACT_BITS 3
 780 #define ONE (1UL << FRACT_BITS)
 781
 782 /*
 783  * Given a hash chain and an item in this hash chain,
 784  * find if a previous entry has the same hash_inputs
 785  * (but differs on tos, mark or oif)
 786  * Returns 0 if an alias is found.
 787  * Returns ONE if rth has no alias before itself.
 788  */
 789 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 790 {
 791         const struct rtable *aux = head;
 792
 793         while (aux != rth) {
 794                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 795                         return 0;
 796                 aux = aux->u.dst.rt_next;
 797         }
 798         return ONE;
 799 }
 800
 801 static void rt_check_expire(void)
 802 {
 803         static unsigned int rover;
 804         unsigned int i = rover, goal;
 805         struct rtable *rth, **rthp;
 806         unsigned long samples = 0;
 807         unsigned long sum = 0, sum2 = 0;
 808         unsigned long delta;
 809         u64 mult;
 810
 811         delta = jiffies - expires_ljiffies;
 812         expires_ljiffies = jiffies;
 813         mult = ((u64)delta) << rt_hash_log;
 814         if (ip_rt_gc_timeout > 1)
 815                 do_div(mult, ip_rt_gc_timeout);
 816         goal = (unsigned int)mult;
 817         if (goal > rt_hash_mask)
 818                 goal = rt_hash_mask + 1;
 819         for (; goal > 0; goal--) {
 820                 unsigned long tmo = ip_rt_gc_timeout;
 821                 unsigned long length;
 822
 823                 i = (i + 1) & rt_hash_mask;
 824                 rthp = &rt_hash_table[i].chain;
 825
 826                 if (need_resched())
 827                         cond_resched();
 828
 829                 samples++;
 830
 831                 if (*rthp == NULL)
 832                         continue;
 833                 length = 0;
 834                 spin_lock_bh(rt_hash_lock_addr(i));
 835                 while ((rth = *rthp) != NULL) {
 836                         prefetch(rth->u.dst.rt_next);
 837                         if (rt_is_expired(rth)) {
 838                                 *rthp = rth->u.dst.rt_next;
 839                                 rt_free(rth);
 840                                 continue;
 841                         }
 842                         if (rth->u.dst.expires) {
 843                                 /* Entry is expired even if it is in use */
 844                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 845 nofree:
 846                                         tmo >>= 1;
 847                                         rthp = &rth->u.dst.rt_next;
 848                                         /*
 849                                          * We only count entries on
 850                                          * a chain with equal hash inputs once
 851                                          * so that entries for different QOS
 852                                          * levels, and other non-hash input
 853                                          * attributes don't unfairly skew
 854                                          * the length computation
 855                                          */
 856                                         length += has_noalias(rt_hash_table[i].chain, rth);
 857                                         continue;
 858                                 }
 859                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 860                                 goto nofree;
 861
 862                         /* Cleanup aged off entries. */
 863                         *rthp = rth->u.dst.rt_next;
 864                         rt_free(rth);
 865                 }
 866                 spin_unlock_bh(rt_hash_lock_addr(i));
 867                 sum += length;
 868                 sum2 += length*length;
 869         }
 870         if (samples) {
 871                 unsigned long avg = sum / samples;
 872                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 873                 rt_chain_length_max = max_t(unsigned long,
 874                                         ip_rt_gc_elasticity,
 875                                         (avg + 4*sd) >> FRACT_BITS);
 876         }
 877         rover = i;
 878 }
 879
 880 /*
 881  * rt_worker_func() is run in process context.
 882  * we call rt_check_expire() to scan part of the hash table
 883  */
 884 static void rt_worker_func(struct work_struct *work)
 885 {
 886         rt_check_expire();
 887         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 888 }
 889
 890 /*
 891  * Pertubation of rt_genid by a small quantity [1..256]
 892  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 893  * many times (2^24) without giving recent rt_genid.
 894  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 895  */
 896 static void rt_cache_invalidate(struct net *net)
 897 {
 898         unsigned char shuffle;
 899
 900         get_random_bytes(&shuffle, sizeof(shuffle));
 901         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 902 }
 903
 904 /*
 905  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 906  * delay >= 0 : invalidate & flush cache (can be long)
 907  */
 908 void rt_cache_flush(struct net *net, int delay)
 909 {
 910         rt_cache_invalidate(net);
 911         if (delay >= 0)
 912                 rt_do_flush(!in_softirq());
 913 }
 914
 915 /* Flush previous cache invalidated entries from the cache */
 916 void rt_cache_flush_batch(void)
 917 {
 918         rt_do_flush(!in_softirq());
 919 }
 920
 921 /*
 922  * We change rt_genid and let gc do the cleanup
 923  */
 924 static void rt_secret_rebuild(unsigned long __net)
 925 {
 926         struct net *net = (struct net *)__net;
 927         rt_cache_invalidate(net);
 928         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 929 }
 930
 931 static void rt_secret_rebuild_oneshot(struct net *net)
 932 {
 933         del_timer_sync(&net->ipv4.rt_secret_timer);
 934         rt_cache_invalidate(net);
 935         if (ip_rt_secret_interval)
 936                 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 937 }
 938
 939 static void rt_emergency_hash_rebuild(struct net *net)
 940 {
 941         if (net_ratelimit()) {
 942                 printk(KERN_WARNING "Route hash chain too long!\n");
 943                 printk(KERN_WARNING "Adjust your secret_interval!\n");
 944         }
 945
 946         rt_secret_rebuild_oneshot(net);
 947 }
 948
 949 /*
 950    Short description of GC goals.
 951
 952    We want to build algorithm, which will keep routing cache
 953    at some equilibrium point, when number of aged off entries
 954    is kept approximately equal to newly generated ones.
 955
 956    Current expiration strength is variable "expire".
 957    We try to adjust it dynamically, so that if networking
 958    is idle expires is large enough to keep enough of warm entries,
 959    and when load increases it reduces to limit cache size.
 960  */
 961
 962 static int rt_garbage_collect(struct dst_ops *ops)
 963 {
 964         static unsigned long expire = RT_GC_TIMEOUT;
 965         static unsigned long last_gc;
 966         static int rover;
 967         static int equilibrium;
 968         struct rtable *rth, **rthp;
 969         unsigned long now = jiffies;
 970         int goal;
 971
 972         /*
 973          * Garbage collection is pretty expensive,
 974          * do not make it too frequently.
 975          */
 976
 977         RT_CACHE_STAT_INC(gc_total);
 978
 979         if (now - last_gc < ip_rt_gc_min_interval &&
 980             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 981                 RT_CACHE_STAT_INC(gc_ignored);
 982                 goto out;
 983         }
 984
 985         /* Calculate number of entries, which we want to expire now. */
 986         goal = atomic_read(&ipv4_dst_ops.entries) -
 987                 (ip_rt_gc_elasticity << rt_hash_log);
 988         if (goal <= 0) {
 989                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 990                         equilibrium = ipv4_dst_ops.gc_thresh;
 991                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 992                 if (goal > 0) {
 993                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 994                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 995                 }
 996         } else {
 997                 /* We are in dangerous area. Try to reduce cache really
 998                  * aggressively.
 999                  */
1000                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1001                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
1002         }
1003
1004         if (now - last_gc >= ip_rt_gc_min_interval)
1005                 last_gc = now;
1006
1007         if (goal <= 0) {
1008                 equilibrium += goal;
1009                 goto work_done;
1010         }
1011
1012         do {
1013                 int i, k;
1014
1015                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1016                         unsigned long tmo = expire;
1017
1018                         k = (k + 1) & rt_hash_mask;
1019                         rthp = &rt_hash_table[k].chain;
1020                         spin_lock_bh(rt_hash_lock_addr(k));
1021                         while ((rth = *rthp) != NULL) {
1022                                 if (!rt_is_expired(rth) &&
1023                                         !rt_may_expire(rth, tmo, expire)) {
1024                                         tmo >>= 1;
1025                                         rthp = &rth->u.dst.rt_next;
1026                                         continue;
1027                                 }
1028                                 *rthp = rth->u.dst.rt_next;
1029                                 rt_free(rth);
1030                                 goal--;
1031                         }
1032                         spin_unlock_bh(rt_hash_lock_addr(k));
1033                         if (goal <= 0)
1034                                 break;
1035                 }
1036                 rover = k;
1037
1038                 if (goal <= 0)
1039                         goto work_done;
1040
1041                 /* Goal is not achieved. We stop process if:
1042
1043                    - if expire reduced to zero. Otherwise, expire is halfed.
1044                    - if table is not full.
1045                    - if we are called from interrupt.
1046                    - jiffies check is just fallback/debug loop breaker.
1047                      We will not spin here for long time in any case.
1048                  */
1049
1050                 RT_CACHE_STAT_INC(gc_goal_miss);
1051
1052                 if (expire == 0)
1053                         break;
1054
1055                 expire >>= 1;
1056 #if RT_CACHE_DEBUG >= 2
1057                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1058                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1059 #endif
1060
1061                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1062                         goto out;
1063         } while (!in_softirq() && time_before_eq(jiffies, now));
1064
1065         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1066                 goto out;
1067         if (net_ratelimit())
1068                 printk(KERN_WARNING "dst cache overflow\n");
1069         RT_CACHE_STAT_INC(gc_dst_overflow);
1070         return 1;
1071
1072 work_done:
1073         expire += ip_rt_gc_min_interval;
1074         if (expire > ip_rt_gc_timeout ||
1075             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1076                 expire = ip_rt_gc_timeout;
1077 #if RT_CACHE_DEBUG >= 2
1078         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1079                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1080 #endif
1081 out:    return 0;
1082 }
1083
1084 /*
1085  * Returns number of entries in a hash chain that have different hash_inputs
1086  */
1087 static int slow_chain_length(const struct rtable *head)
1088 {
1089         int length = 0;
1090         const struct rtable *rth = head;
1091
1092         while (rth) {
1093                 length += has_noalias(head, rth);
1094                 rth = rth->u.dst.rt_next;
1095         }
1096         return length >> FRACT_BITS;
1097 }
1098
1099 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1100                           struct rtable **rp, struct sk_buff *skb)
1101 {
1102         struct rtable   *rth, **rthp;
1103         unsigned long   now;
1104         struct rtable *cand, **candp;
1105         u32             min_score;
1106         int             chain_length;
1107         int attempts = !in_softirq();
1108
1109 restart:
1110         chain_length = 0;
1111         min_score = ~(u32)0;
1112         cand = NULL;
1113         candp = NULL;
1114         now = jiffies;
1115
1116         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1117                 /*
1118                  * If we're not caching, just tell the caller we
1119                  * were successful and don't touch the route.  The
1120                  * caller hold the sole reference to the cache entry, and
1121                  * it will be released when the caller is done with it.
1122                  * If we drop it here, the callers have no way to resolve routes
1123                  * when we're not caching.  Instead, just point *rp at rt, so
1124                  * the caller gets a single use out of the route
1125                  * Note that we do rt_free on this new route entry, so that
1126                  * once its refcount hits zero, we are still able to reap it
1127                  * (Thanks Alexey)
1128                  * Note also the rt_free uses call_rcu.  We don't actually
1129                  * need rcu protection here, this is just our path to get
1130                  * on the route gc list.
1131                  */
1132
1133                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1134                         int err = arp_bind_neighbour(&rt->u.dst);
1135                         if (err) {
1136                                 if (net_ratelimit())
1137                                         printk(KERN_WARNING
1138                                             "Neighbour table failure & not caching routes.\n");
1139                                 rt_drop(rt);
1140                                 return err;
1141                         }
1142                 }
1143
1144                 rt_free(rt);
1145                 goto skip_hashing;
1146         }
1147
1148         rthp = &rt_hash_table[hash].chain;
1149
1150         spin_lock_bh(rt_hash_lock_addr(hash));
1151         while ((rth = *rthp) != NULL) {
1152                 if (rt_is_expired(rth)) {
1153                         *rthp = rth->u.dst.rt_next;
1154                         rt_free(rth);
1155                         continue;
1156                 }
1157                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1158                         /* Put it first */
1159                         *rthp = rth->u.dst.rt_next;
1160                         /*
1161                          * Since lookup is lockfree, the deletion
1162                          * must be visible to another weakly ordered CPU before
1163                          * the insertion at the start of the hash chain.
1164                          */
1165                         rcu_assign_pointer(rth->u.dst.rt_next,
1166                                            rt_hash_table[hash].chain);
1167                         /*
1168                          * Since lookup is lockfree, the update writes
1169                          * must be ordered for consistency on SMP.
1170                          */
1171                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1172
1173                         dst_use(&rth->u.dst, now);
1174                         spin_unlock_bh(rt_hash_lock_addr(hash));
1175
1176                         rt_drop(rt);
1177                         if (rp)
1178                                 *rp = rth;
1179                         else
1180                                 skb_dst_set(skb, &rth->u.dst);
1181                         return 0;
1182                 }
1183
1184                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1185                         u32 score = rt_score(rth);
1186
1187                         if (score <= min_score) {
1188                                 cand = rth;
1189                                 candp = rthp;
1190                                 min_score = score;
1191                         }
1192                 }
1193
1194                 chain_length++;
1195
1196                 rthp = &rth->u.dst.rt_next;
1197         }
1198
1199         if (cand) {
1200                 /* ip_rt_gc_elasticity used to be average length of chain
1201                  * length, when exceeded gc becomes really aggressive.
1202                  *
1203                  * The second limit is less certain. At the moment it allows
1204                  * only 2 entries per bucket. We will see.
1205                  */
1206                 if (chain_length > ip_rt_gc_elasticity) {
1207                         *candp = cand->u.dst.rt_next;
1208                         rt_free(cand);
1209                 }
1210         } else {
1211                 if (chain_length > rt_chain_length_max &&
1212                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1213                         struct net *net = dev_net(rt->u.dst.dev);
1214                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1215                         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1216                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1217                                         rt->u.dst.dev->name, num);
1218                         }
1219                         rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1220                 }
1221         }
1222
1223         /* Try to bind route to arp only if it is output
1224            route or unicast forwarding path.
1225          */
1226         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1227                 int err = arp_bind_neighbour(&rt->u.dst);
1228                 if (err) {
1229                         spin_unlock_bh(rt_hash_lock_addr(hash));
1230
1231                         if (err != -ENOBUFS) {
1232                                 rt_drop(rt);
1233                                 return err;
1234                         }
1235
1236                         /* Neighbour tables are full and nothing
1237                            can be released. Try to shrink route cache,
1238                            it is most likely it holds some neighbour records.
1239                          */
1240                         if (attempts-- > 0) {
1241                                 int saved_elasticity = ip_rt_gc_elasticity;
1242                                 int saved_int = ip_rt_gc_min_interval;
1243                                 ip_rt_gc_elasticity     = 1;
1244                                 ip_rt_gc_min_interval   = 0;
1245                                 rt_garbage_collect(&ipv4_dst_ops);
1246                                 ip_rt_gc_min_interval   = saved_int;
1247                                 ip_rt_gc_elasticity     = saved_elasticity;
1248                                 goto restart;
1249                         }
1250
1251                         if (net_ratelimit())
1252                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1253                         rt_drop(rt);
1254                         return -ENOBUFS;
1255                 }
1256         }
1257
1258         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1259
1260 #if RT_CACHE_DEBUG >= 2
1261         if (rt->u.dst.rt_next) {
1262                 struct rtable *trt;
1263                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1264                        hash, &rt->rt_dst);
1265                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1266                         printk(" . %pI4", &trt->rt_dst);
1267                 printk("\n");
1268         }
1269 #endif
1270         /*
1271          * Since lookup is lockfree, we must make sure
1272          * previous writes to rt are comitted to memory
1273          * before making rt visible to other CPUS.
1274          */
1275         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1276
1277         spin_unlock_bh(rt_hash_lock_addr(hash));
1278
1279 skip_hashing:
1280         if (rp)
1281                 *rp = rt;
1282         else
1283                 skb_dst_set(skb, &rt->u.dst);
1284         return 0;
1285 }
1286
1287 void rt_bind_peer(struct rtable *rt, int create)
1288 {
1289         static DEFINE_SPINLOCK(rt_peer_lock);
1290         struct inet_peer *peer;
1291
1292         peer = inet_getpeer(rt->rt_dst, create);
1293
1294         spin_lock_bh(&rt_peer_lock);
1295         if (rt->peer == NULL) {
1296                 rt->peer = peer;
1297                 peer = NULL;
1298         }
1299         spin_unlock_bh(&rt_peer_lock);
1300         if (peer)
1301                 inet_putpeer(peer);
1302 }
1303
1304 /*
1305  * Peer allocation may fail only in serious out-of-memory conditions.  However
1306  * we still can generate some output.
1307  * Random ID selection looks a bit dangerous because we have no chances to
1308  * select ID being unique in a reasonable period of time.
1309  * But broken packet identifier may be better than no packet at all.
1310  */
1311 static void ip_select_fb_ident(struct iphdr *iph)
1312 {
1313         static DEFINE_SPINLOCK(ip_fb_id_lock);
1314         static u32 ip_fallback_id;
1315         u32 salt;
1316
1317         spin_lock_bh(&ip_fb_id_lock);
1318         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1319         iph->id = htons(salt & 0xFFFF);
1320         ip_fallback_id = salt;
1321         spin_unlock_bh(&ip_fb_id_lock);
1322 }
1323
1324 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1325 {
1326         struct rtable *rt = (struct rtable *) dst;
1327
1328         if (rt) {
1329                 if (rt->peer == NULL)
1330                         rt_bind_peer(rt, 1);
1331
1332                 /* If peer is attached to destination, it is never detached,
1333                    so that we need not to grab a lock to dereference it.
1334                  */
1335                 if (rt->peer) {
1336                         iph->id = htons(inet_getid(rt->peer, more));
1337                         return;
1338                 }
1339         } else
1340                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1341                        __builtin_return_address(0));
1342
1343         ip_select_fb_ident(iph);
1344 }
1345
1346 static void rt_del(unsigned hash, struct rtable *rt)
1347 {
1348         struct rtable **rthp, *aux;
1349
1350         rthp = &rt_hash_table[hash].chain;
1351         spin_lock_bh(rt_hash_lock_addr(hash));
1352         ip_rt_put(rt);
1353         while ((aux = *rthp) != NULL) {
1354                 if (aux == rt || rt_is_expired(aux)) {
1355                         *rthp = aux->u.dst.rt_next;
1356                         rt_free(aux);
1357                         continue;
1358                 }
1359                 rthp = &aux->u.dst.rt_next;
1360         }
1361         spin_unlock_bh(rt_hash_lock_addr(hash));
1362 }
1363
1364 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1365                     __be32 saddr, struct net_device *dev)
1366 {
1367         int i, k;
1368         struct in_device *in_dev = in_dev_get(dev);
1369         struct rtable *rth, **rthp;
1370         __be32  skeys[2] = { saddr, 0 };
1371         int  ikeys[2] = { dev->ifindex, 0 };
1372         struct netevent_redirect netevent;
1373         struct net *net;
1374
1375         if (!in_dev)
1376                 return;
1377
1378         net = dev_net(dev);
1379         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1380             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1381             ipv4_is_zeronet(new_gw))
1382                 goto reject_redirect;
1383
1384         if (!rt_caching(net))
1385                 goto reject_redirect;
1386
1387         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1388                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1389                         goto reject_redirect;
1390                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1391                         goto reject_redirect;
1392         } else {
1393                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1394                         goto reject_redirect;
1395         }
1396
1397         for (i = 0; i < 2; i++) {
1398                 for (k = 0; k < 2; k++) {
1399                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1400                                                 rt_genid(net));
1401
1402                         rthp=&rt_hash_table[hash].chain;
1403
1404                         rcu_read_lock();
1405                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1406                                 struct rtable *rt;
1407
1408                                 if (rth->fl.fl4_dst != daddr ||
1409                                     rth->fl.fl4_src != skeys[i] ||
1410                                     rth->fl.oif != ikeys[k] ||
1411                                     rth->fl.iif != 0 ||
1412                                     rt_is_expired(rth) ||
1413                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1414                                         rthp = &rth->u.dst.rt_next;
1415                                         continue;
1416                                 }
1417
1418                                 if (rth->rt_dst != daddr ||
1419                                     rth->rt_src != saddr ||
1420                                     rth->u.dst.error ||
1421                                     rth->rt_gateway != old_gw ||
1422                                     rth->u.dst.dev != dev)
1423                                         break;
1424
1425                                 dst_hold(&rth->u.dst);
1426                                 rcu_read_unlock();
1427
1428                                 rt = dst_alloc(&ipv4_dst_ops);
1429                                 if (rt == NULL) {
1430                                         ip_rt_put(rth);
1431                                         in_dev_put(in_dev);
1432                                         return;
1433                                 }
1434
1435                                 /* Copy all the information. */
1436                                 *rt = *rth;
1437                                 rt->u.dst.__use         = 1;
1438                                 atomic_set(&rt->u.dst.__refcnt, 1);
1439                                 rt->u.dst.child         = NULL;
1440                                 if (rt->u.dst.dev)
1441                                         dev_hold(rt->u.dst.dev);
1442                                 if (rt->idev)
1443                                         in_dev_hold(rt->idev);
1444                                 rt->u.dst.obsolete      = 0;
1445                                 rt->u.dst.lastuse       = jiffies;
1446                                 rt->u.dst.path          = &rt->u.dst;
1447                                 rt->u.dst.neighbour     = NULL;
1448                                 rt->u.dst.hh            = NULL;
1449 #ifdef CONFIG_XFRM
1450                                 rt->u.dst.xfrm          = NULL;
1451 #endif
1452                                 rt->rt_genid            = rt_genid(net);
1453                                 rt->rt_flags            |= RTCF_REDIRECTED;
1454
1455                                 /* Gateway is different ... */
1456                                 rt->rt_gateway          = new_gw;
1457
1458                                 /* Redirect received -> path was valid */
1459                                 dst_confirm(&rth->u.dst);
1460
1461                                 if (rt->peer)
1462                                         atomic_inc(&rt->peer->refcnt);
1463
1464                                 if (arp_bind_neighbour(&rt->u.dst) ||
1465                                     !(rt->u.dst.neighbour->nud_state &
1466                                             NUD_VALID)) {
1467                                         if (rt->u.dst.neighbour)
1468                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1469                                         ip_rt_put(rth);
1470                                         rt_drop(rt);
1471                                         goto do_next;
1472                                 }
1473
1474                                 netevent.old = &rth->u.dst;
1475                                 netevent.new = &rt->u.dst;
1476                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1477                                                         &netevent);
1478
1479                                 rt_del(hash, rth);
1480                                 if (!rt_intern_hash(hash, rt, &rt, NULL))
1481                                         ip_rt_put(rt);
1482                                 goto do_next;
1483                         }
1484                         rcu_read_unlock();
1485                 do_next:
1486                         ;
1487                 }
1488         }
1489         in_dev_put(in_dev);
1490         return;
1491
1492 reject_redirect:
1493 #ifdef CONFIG_IP_ROUTE_VERBOSE
1494         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1495                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1496                         "  Advised path = %pI4 -> %pI4\n",
1497                        &old_gw, dev->name, &new_gw,
1498                        &saddr, &daddr);
1499 #endif
1500         in_dev_put(in_dev);
1501 }
1502
1503 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1504 {
1505         struct rtable *rt = (struct rtable *)dst;
1506         struct dst_entry *ret = dst;
1507
1508         if (rt) {
1509                 if (dst->obsolete) {
1510                         ip_rt_put(rt);
1511                         ret = NULL;
1512                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1513                            rt->u.dst.expires) {
1514                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1515                                                 rt->fl.oif,
1516                                                 rt_genid(dev_net(dst->dev)));
1517 #if RT_CACHE_DEBUG >= 1
1518                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1519                                 &rt->rt_dst, rt->fl.fl4_tos);
1520 #endif
1521                         rt_del(hash, rt);
1522                         ret = NULL;
1523                 }
1524         }
1525         return ret;
1526 }
1527
1528 /*
1529  * Algorithm:
1530  *      1. The first ip_rt_redirect_number redirects are sent
1531  *         with exponential backoff, then we stop sending them at all,
1532  *         assuming that the host ignores our redirects.
1533  *      2. If we did not see packets requiring redirects
1534  *         during ip_rt_redirect_silence, we assume that the host
1535  *         forgot redirected route and start to send redirects again.
1536  *
1537  * This algorithm is much cheaper and more intelligent than dumb load limiting
1538  * in icmp.c.
1539  *
1540  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1541  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1542  */
1543
1544 void ip_rt_send_redirect(struct sk_buff *skb)
1545 {
1546         struct rtable *rt = skb_rtable(skb);
1547         struct in_device *in_dev;
1548         int log_martians;
1549
1550         rcu_read_lock();
1551         in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1552         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1553                 rcu_read_unlock();
1554                 return;
1555         }
1556         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1557         rcu_read_unlock();
1558
1559         /* No redirected packets during ip_rt_redirect_silence;
1560          * reset the algorithm.
1561          */
1562         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1563                 rt->u.dst.rate_tokens = 0;
1564
1565         /* Too many ignored redirects; do not send anything
1566          * set u.dst.rate_last to the last seen redirected packet.
1567          */
1568         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1569                 rt->u.dst.rate_last = jiffies;
1570                 return;
1571         }
1572
1573         /* Check for load limit; set rate_last to the latest sent
1574          * redirect.
1575          */
1576         if (rt->u.dst.rate_tokens == 0 ||
1577             time_after(jiffies,
1578                        (rt->u.dst.rate_last +
1579                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1580                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1581                 rt->u.dst.rate_last = jiffies;
1582                 ++rt->u.dst.rate_tokens;
1583 #ifdef CONFIG_IP_ROUTE_VERBOSE
1584                 if (log_martians &&
1585                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1586                     net_ratelimit())
1587                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1588                                 &rt->rt_src, rt->rt_iif,
1589                                 &rt->rt_dst, &rt->rt_gateway);
1590 #endif
1591         }
1592 }
1593
1594 static int ip_error(struct sk_buff *skb)
1595 {
1596         struct rtable *rt = skb_rtable(skb);
1597         unsigned long now;
1598         int code;
1599
1600         switch (rt->u.dst.error) {
1601                 case EINVAL:
1602                 default:
1603                         goto out;
1604                 case EHOSTUNREACH:
1605                         code = ICMP_HOST_UNREACH;
1606                         break;
1607                 case ENETUNREACH:
1608                         code = ICMP_NET_UNREACH;
1609                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1610                                         IPSTATS_MIB_INNOROUTES);
1611                         break;
1612                 case EACCES:
1613                         code = ICMP_PKT_FILTERED;
1614                         break;
1615         }
1616
1617         now = jiffies;
1618         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1619         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1620                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1621         rt->u.dst.rate_last = now;
1622         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1623                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1624                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1625         }
1626
1627 out:    kfree_skb(skb);
1628         return 0;
1629 }
1630
1631 /*
1632  *      The last two values are not from the RFC but
1633  *      are needed for AMPRnet AX.25 paths.
1634  */
1635
1636 static const unsigned short mtu_plateau[] =
1637 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1638
1639 static inline unsigned short guess_mtu(unsigned short old_mtu)
1640 {
1641         int i;
1642
1643         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1644                 if (old_mtu > mtu_plateau[i])
1645                         return mtu_plateau[i];
1646         return 68;
1647 }
1648
1649 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1650                                  unsigned short new_mtu,
1651                                  struct net_device *dev)
1652 {
1653         int i, k;
1654         unsigned short old_mtu = ntohs(iph->tot_len);
1655         struct rtable *rth;
1656         int  ikeys[2] = { dev->ifindex, 0 };
1657         __be32  skeys[2] = { iph->saddr, 0, };
1658         __be32  daddr = iph->daddr;
1659         unsigned short est_mtu = 0;
1660
1661         for (k = 0; k < 2; k++) {
1662                 for (i = 0; i < 2; i++) {
1663                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1664                                                 rt_genid(net));
1665
1666                         rcu_read_lock();
1667                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1668                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1669                                 unsigned short mtu = new_mtu;
1670
1671                                 if (rth->fl.fl4_dst != daddr ||
1672                                     rth->fl.fl4_src != skeys[i] ||
1673                                     rth->rt_dst != daddr ||
1674                                     rth->rt_src != iph->saddr ||
1675                                     rth->fl.oif != ikeys[k] ||
1676                                     rth->fl.iif != 0 ||
1677                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1678                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1679                                     rt_is_expired(rth))
1680                                         continue;
1681
1682                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1683
1684                                         /* BSD 4.2 compatibility hack :-( */
1685                                         if (mtu == 0 &&
1686                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1687                                             old_mtu >= 68 + (iph->ihl << 2))
1688                                                 old_mtu -= iph->ihl << 2;
1689
1690                                         mtu = guess_mtu(old_mtu);
1691                                 }
1692                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1693                                         if (mtu < dst_mtu(&rth->u.dst)) {
1694                                                 dst_confirm(&rth->u.dst);
1695                                                 if (mtu < ip_rt_min_pmtu) {
1696                                                         mtu = ip_rt_min_pmtu;
1697                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1698                                                                 (1 << RTAX_MTU);
1699                                                 }
1700                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1701                                                 dst_set_expires(&rth->u.dst,
1702                                                         ip_rt_mtu_expires);
1703                                         }
1704                                         est_mtu = mtu;
1705                                 }
1706                         }
1707                         rcu_read_unlock();
1708                 }
1709         }
1710         return est_mtu ? : new_mtu;
1711 }
1712
1713 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1714 {
1715         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1716             !(dst_metric_locked(dst, RTAX_MTU))) {
1717                 if (mtu < ip_rt_min_pmtu) {
1718                         mtu = ip_rt_min_pmtu;
1719                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1720                 }
1721                 dst->metrics[RTAX_MTU-1] = mtu;
1722                 dst_set_expires(dst, ip_rt_mtu_expires);
1723                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1724         }
1725 }
1726
1727 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1728 {
1729         return NULL;
1730 }
1731
1732 static void ipv4_dst_destroy(struct dst_entry *dst)
1733 {
1734         struct rtable *rt = (struct rtable *) dst;
1735         struct inet_peer *peer = rt->peer;
1736         struct in_device *idev = rt->idev;
1737
1738         if (peer) {
1739                 rt->peer = NULL;
1740                 inet_putpeer(peer);
1741         }
1742
1743         if (idev) {
1744                 rt->idev = NULL;
1745                 in_dev_put(idev);
1746         }
1747 }
1748
1749 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1750                             int how)
1751 {
1752         struct rtable *rt = (struct rtable *) dst;
1753         struct in_device *idev = rt->idev;
1754         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1755                 struct in_device *loopback_idev =
1756                         in_dev_get(dev_net(dev)->loopback_dev);
1757                 if (loopback_idev) {
1758                         rt->idev = loopback_idev;
1759                         in_dev_put(idev);
1760                 }
1761         }
1762 }
1763
1764 static void ipv4_link_failure(struct sk_buff *skb)
1765 {
1766         struct rtable *rt;
1767
1768         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1769
1770         rt = skb_rtable(skb);
1771         if (rt)
1772                 dst_set_expires(&rt->u.dst, 0);
1773 }
1774
1775 static int ip_rt_bug(struct sk_buff *skb)
1776 {
1777         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1778                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1779                 skb->dev ? skb->dev->name : "?");
1780         kfree_skb(skb);
1781         return 0;
1782 }
1783
1784 /*
1785    We do not cache source address of outgoing interface,
1786    because it is used only by IP RR, TS and SRR options,
1787    so that it out of fast path.
1788
1789    BTW remember: "addr" is allowed to be not aligned
1790    in IP options!
1791  */
1792
1793 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1794 {
1795         __be32 src;
1796         struct fib_result res;
1797
1798         if (rt->fl.iif == 0)
1799                 src = rt->rt_src;
1800         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1801                 src = FIB_RES_PREFSRC(res);
1802                 fib_res_put(&res);
1803         } else
1804                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1805                                         RT_SCOPE_UNIVERSE);
1806         memcpy(addr, &src, 4);
1807 }
1808
1809 #ifdef CONFIG_NET_CLS_ROUTE
1810 static void set_class_tag(struct rtable *rt, u32 tag)
1811 {
1812         if (!(rt->u.dst.tclassid & 0xFFFF))
1813                 rt->u.dst.tclassid |= tag & 0xFFFF;
1814         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1815                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1816 }
1817 #endif
1818
1819 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1820 {
1821         struct fib_info *fi = res->fi;
1822
1823         if (fi) {
1824                 if (FIB_RES_GW(*res) &&
1825                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1826                         rt->rt_gateway = FIB_RES_GW(*res);
1827                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1828                        sizeof(rt->u.dst.metrics));
1829                 if (fi->fib_mtu == 0) {
1830                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1831                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1832                             rt->rt_gateway != rt->rt_dst &&
1833                             rt->u.dst.dev->mtu > 576)
1834                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1835                 }
1836 #ifdef CONFIG_NET_CLS_ROUTE
1837                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1838 #endif
1839         } else
1840                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1841
1842         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1843                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1844         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1845                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1846         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1847                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1848                                        ip_rt_min_advmss);
1849         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1850                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1851
1852 #ifdef CONFIG_NET_CLS_ROUTE
1853 #ifdef CONFIG_IP_MULTIPLE_TABLES
1854         set_class_tag(rt, fib_rules_tclass(res));
1855 #endif
1856         set_class_tag(rt, itag);
1857 #endif
1858         rt->rt_type = res->type;
1859 }
1860
1861 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1862                                 u8 tos, struct net_device *dev, int our)
1863 {
1864         unsigned hash;
1865         struct rtable *rth;
1866         __be32 spec_dst;
1867         struct in_device *in_dev = in_dev_get(dev);
1868         u32 itag = 0;
1869
1870         /* Primary sanity checks. */
1871
1872         if (in_dev == NULL)
1873                 return -EINVAL;
1874
1875         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1876             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1877                 goto e_inval;
1878
1879         if (ipv4_is_zeronet(saddr)) {
1880                 if (!ipv4_is_local_multicast(daddr))
1881                         goto e_inval;
1882                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1883         } else if (fib_validate_source(saddr, 0, tos, 0,
1884                                         dev, &spec_dst, &itag, 0) < 0)
1885                 goto e_inval;
1886
1887         rth = dst_alloc(&ipv4_dst_ops);
1888         if (!rth)
1889                 goto e_nobufs;
1890
1891         rth->u.dst.output= ip_rt_bug;
1892
1893         atomic_set(&rth->u.dst.__refcnt, 1);
1894         rth->u.dst.flags= DST_HOST;
1895         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1896                 rth->u.dst.flags |= DST_NOPOLICY;
1897         rth->fl.fl4_dst = daddr;
1898         rth->rt_dst     = daddr;
1899         rth->fl.fl4_tos = tos;
1900         rth->fl.mark    = skb->mark;
1901         rth->fl.fl4_src = saddr;
1902         rth->rt_src     = saddr;
1903 #ifdef CONFIG_NET_CLS_ROUTE
1904         rth->u.dst.tclassid = itag;
1905 #endif
1906         rth->rt_iif     =
1907         rth->fl.iif     = dev->ifindex;
1908         rth->u.dst.dev  = init_net.loopback_dev;
1909         dev_hold(rth->u.dst.dev);
1910         rth->idev       = in_dev_get(rth->u.dst.dev);
1911         rth->fl.oif     = 0;
1912         rth->rt_gateway = daddr;
1913         rth->rt_spec_dst= spec_dst;
1914         rth->rt_genid   = rt_genid(dev_net(dev));
1915         rth->rt_flags   = RTCF_MULTICAST;
1916         rth->rt_type    = RTN_MULTICAST;
1917         if (our) {
1918                 rth->u.dst.input= ip_local_deliver;
1919                 rth->rt_flags |= RTCF_LOCAL;
1920         }
1921
1922 #ifdef CONFIG_IP_MROUTE
1923         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1924                 rth->u.dst.input = ip_mr_input;
1925 #endif
1926         RT_CACHE_STAT_INC(in_slow_mc);
1927
1928         in_dev_put(in_dev);
1929         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1930         return rt_intern_hash(hash, rth, NULL, skb);
1931
1932 e_nobufs:
1933         in_dev_put(in_dev);
1934         return -ENOBUFS;
1935
1936 e_inval:
1937         in_dev_put(in_dev);
1938         return -EINVAL;
1939 }
1940
1941
1942 static void ip_handle_martian_source(struct net_device *dev,
1943                                      struct in_device *in_dev,
1944                                      struct sk_buff *skb,
1945                                      __be32 daddr,
1946                                      __be32 saddr)
1947 {
1948         RT_CACHE_STAT_INC(in_martian_src);
1949 #ifdef CONFIG_IP_ROUTE_VERBOSE
1950         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1951                 /*
1952                  *      RFC1812 recommendation, if source is martian,
1953                  *      the only hint is MAC header.
1954                  */
1955                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1956                         &daddr, &saddr, dev->name);
1957                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1958                         int i;
1959                         const unsigned char *p = skb_mac_header(skb);
1960                         printk(KERN_WARNING "ll header: ");
1961                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1962                                 printk("%02x", *p);
1963                                 if (i < (dev->hard_header_len - 1))
1964                                         printk(":");
1965                         }
1966                         printk("\n");
1967                 }
1968         }
1969 #endif
1970 }
1971
1972 static int __mkroute_input(struct sk_buff *skb,
1973                            struct fib_result *res,
1974                            struct in_device *in_dev,
1975                            __be32 daddr, __be32 saddr, u32 tos,
1976                            struct rtable **result)
1977 {
1978
1979         struct rtable *rth;
1980         int err;
1981         struct in_device *out_dev;
1982         unsigned flags = 0;
1983         __be32 spec_dst;
1984         u32 itag;
1985
1986         /* get a working reference to the output device */
1987         out_dev = in_dev_get(FIB_RES_DEV(*res));
1988         if (out_dev == NULL) {
1989                 if (net_ratelimit())
1990                         printk(KERN_CRIT "Bug in ip_route_input" \
1991                                "_slow(). Please, report\n");
1992                 return -EINVAL;
1993         }
1994
1995
1996         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1997                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1998         if (err < 0) {
1999                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2000                                          saddr);
2001
2002                 err = -EINVAL;
2003                 goto cleanup;
2004         }
2005
2006         if (err)
2007                 flags |= RTCF_DIRECTSRC;
2008
2009         if (out_dev == in_dev && err &&
2010             (IN_DEV_SHARED_MEDIA(out_dev) ||
2011              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2012                 flags |= RTCF_DOREDIRECT;
2013
2014         if (skb->protocol != htons(ETH_P_IP)) {
2015                 /* Not IP (i.e. ARP). Do not create route, if it is
2016                  * invalid for proxy arp. DNAT routes are always valid.
2017                  *
2018                  * Proxy arp feature have been extended to allow, ARP
2019                  * replies back to the same interface, to support
2020                  * Private VLAN switch technologies. See arp.c.
2021                  */
2022                 if (out_dev == in_dev &&
2023                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2024                         err = -EINVAL;
2025                         goto cleanup;
2026                 }
2027         }
2028
2029
2030         rth = dst_alloc(&ipv4_dst_ops);
2031         if (!rth) {
2032                 err = -ENOBUFS;
2033                 goto cleanup;
2034         }
2035
2036         atomic_set(&rth->u.dst.__refcnt, 1);
2037         rth->u.dst.flags= DST_HOST;
2038         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2039                 rth->u.dst.flags |= DST_NOPOLICY;
2040         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2041                 rth->u.dst.flags |= DST_NOXFRM;
2042         rth->fl.fl4_dst = daddr;
2043         rth->rt_dst     = daddr;
2044         rth->fl.fl4_tos = tos;
2045         rth->fl.mark    = skb->mark;
2046         rth->fl.fl4_src = saddr;
2047         rth->rt_src     = saddr;
2048         rth->rt_gateway = daddr;
2049         rth->rt_iif     =
2050                 rth->fl.iif     = in_dev->dev->ifindex;
2051         rth->u.dst.dev  = (out_dev)->dev;
2052         dev_hold(rth->u.dst.dev);
2053         rth->idev       = in_dev_get(rth->u.dst.dev);
2054         rth->fl.oif     = 0;
2055         rth->rt_spec_dst= spec_dst;
2056
2057         rth->u.dst.input = ip_forward;
2058         rth->u.dst.output = ip_output;
2059         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2060
2061         rt_set_nexthop(rth, res, itag);
2062
2063         rth->rt_flags = flags;
2064
2065         *result = rth;
2066         err = 0;
2067  cleanup:
2068         /* release the working reference to the output device */
2069         in_dev_put(out_dev);
2070         return err;
2071 }
2072
2073 static int ip_mkroute_input(struct sk_buff *skb,
2074                             struct fib_result *res,
2075                             const struct flowi *fl,
2076                             struct in_device *in_dev,
2077                             __be32 daddr, __be32 saddr, u32 tos)
2078 {
2079         struct rtable* rth = NULL;
2080         int err;
2081         unsigned hash;
2082
2083 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2084         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2085                 fib_select_multipath(fl, res);
2086 #endif
2087
2088         /* create a routing cache entry */
2089         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2090         if (err)
2091                 return err;
2092
2093         /* put it into the cache */
2094         hash = rt_hash(daddr, saddr, fl->iif,
2095                        rt_genid(dev_net(rth->u.dst.dev)));
2096         return rt_intern_hash(hash, rth, NULL, skb);
2097 }
2098
2099 /*
2100  *      NOTE. We drop all the packets that has local source
2101  *      addresses, because every properly looped back packet
2102  *      must have correct destination already attached by output routine.
2103  *
2104  *      Such approach solves two big problems:
2105  *      1. Not simplex devices are handled properly.
2106  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2107  */
2108
2109 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2110                                u8 tos, struct net_device *dev)
2111 {
2112         struct fib_result res;
2113         struct in_device *in_dev = in_dev_get(dev);
2114         struct flowi fl = { .nl_u = { .ip4_u =
2115                                       { .daddr = daddr,
2116                                         .saddr = saddr,
2117                                         .tos = tos,
2118                                         .scope = RT_SCOPE_UNIVERSE,
2119                                       } },
2120                             .mark = skb->mark,
2121                             .iif = dev->ifindex };
2122         unsigned        flags = 0;
2123         u32             itag = 0;
2124         struct rtable * rth;
2125         unsigned        hash;
2126         __be32          spec_dst;
2127         int             err = -EINVAL;
2128         int             free_res = 0;
2129         struct net    * net = dev_net(dev);
2130
2131         /* IP on this device is disabled. */
2132
2133         if (!in_dev)
2134                 goto out;
2135
2136         /* Check for the most weird martians, which can be not detected
2137            by fib_lookup.
2138          */
2139
2140         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2141             ipv4_is_loopback(saddr))
2142                 goto martian_source;
2143
2144         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2145                 goto brd_input;
2146
2147         /* Accept zero addresses only to limited broadcast;
2148          * I even do not know to fix it or not. Waiting for complains :-)
2149          */
2150         if (ipv4_is_zeronet(saddr))
2151                 goto martian_source;
2152
2153         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2154             ipv4_is_loopback(daddr))
2155                 goto martian_destination;
2156
2157         /*
2158          *      Now we are ready to route packet.
2159          */
2160         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2161                 if (!IN_DEV_FORWARD(in_dev))
2162                         goto e_hostunreach;
2163                 goto no_route;
2164         }
2165         free_res = 1;
2166
2167         RT_CACHE_STAT_INC(in_slow_tot);
2168
2169         if (res.type == RTN_BROADCAST)
2170                 goto brd_input;
2171
2172         if (res.type == RTN_LOCAL) {
2173                 int result;
2174                 result = fib_validate_source(saddr, daddr, tos,
2175                                              net->loopback_dev->ifindex,
2176                                              dev, &spec_dst, &itag, skb->mark);
2177                 if (result < 0)
2178                         goto martian_source;
2179                 if (result)
2180                         flags |= RTCF_DIRECTSRC;
2181                 spec_dst = daddr;
2182                 goto local_input;
2183         }
2184
2185         if (!IN_DEV_FORWARD(in_dev))
2186                 goto e_hostunreach;
2187         if (res.type != RTN_UNICAST)
2188                 goto martian_destination;
2189
2190         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2191 done:
2192         in_dev_put(in_dev);
2193         if (free_res)
2194                 fib_res_put(&res);
2195 out:    return err;
2196
2197 brd_input:
2198         if (skb->protocol != htons(ETH_P_IP))
2199                 goto e_inval;
2200
2201         if (ipv4_is_zeronet(saddr))
2202                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2203         else {
2204                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2205                                           &itag, skb->mark);
2206                 if (err < 0)
2207                         goto martian_source;
2208                 if (err)
2209                         flags |= RTCF_DIRECTSRC;
2210         }
2211         flags |= RTCF_BROADCAST;
2212         res.type = RTN_BROADCAST;
2213         RT_CACHE_STAT_INC(in_brd);
2214
2215 local_input:
2216         rth = dst_alloc(&ipv4_dst_ops);
2217         if (!rth)
2218                 goto e_nobufs;
2219
2220         rth->u.dst.output= ip_rt_bug;
2221         rth->rt_genid = rt_genid(net);
2222
2223         atomic_set(&rth->u.dst.__refcnt, 1);
2224         rth->u.dst.flags= DST_HOST;
2225         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2226                 rth->u.dst.flags |= DST_NOPOLICY;
2227         rth->fl.fl4_dst = daddr;
2228         rth->rt_dst     = daddr;
2229         rth->fl.fl4_tos = tos;
2230         rth->fl.mark    = skb->mark;
2231         rth->fl.fl4_src = saddr;
2232         rth->rt_src     = saddr;
2233 #ifdef CONFIG_NET_CLS_ROUTE
2234         rth->u.dst.tclassid = itag;
2235 #endif
2236         rth->rt_iif     =
2237         rth->fl.iif     = dev->ifindex;
2238         rth->u.dst.dev  = net->loopback_dev;
2239         dev_hold(rth->u.dst.dev);
2240         rth->idev       = in_dev_get(rth->u.dst.dev);
2241         rth->rt_gateway = daddr;
2242         rth->rt_spec_dst= spec_dst;
2243         rth->u.dst.input= ip_local_deliver;
2244         rth->rt_flags   = flags|RTCF_LOCAL;
2245         if (res.type == RTN_UNREACHABLE) {
2246                 rth->u.dst.input= ip_error;
2247                 rth->u.dst.error= -err;
2248                 rth->rt_flags   &= ~RTCF_LOCAL;
2249         }
2250         rth->rt_type    = res.type;
2251         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2252         err = rt_intern_hash(hash, rth, NULL, skb);
2253         goto done;
2254
2255 no_route:
2256         RT_CACHE_STAT_INC(in_no_route);
2257         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2258         res.type = RTN_UNREACHABLE;
2259         if (err == -ESRCH)
2260                 err = -ENETUNREACH;
2261         goto local_input;
2262
2263         /*
2264          *      Do not cache martian addresses: they should be logged (RFC1812)
2265          */
2266 martian_destination:
2267         RT_CACHE_STAT_INC(in_martian_dst);
2268 #ifdef CONFIG_IP_ROUTE_VERBOSE
2269         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2270                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2271                         &daddr, &saddr, dev->name);
2272 #endif
2273
2274 e_hostunreach:
2275         err = -EHOSTUNREACH;
2276         goto done;
2277
2278 e_inval:
2279         err = -EINVAL;
2280         goto done;
2281
2282 e_nobufs:
2283         err = -ENOBUFS;
2284         goto done;
2285
2286 martian_source:
2287         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2288         goto e_inval;
2289 }
2290
2291 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2292                    u8 tos, struct net_device *dev)
2293 {
2294         struct rtable * rth;
2295         unsigned        hash;
2296         int iif = dev->ifindex;
2297         struct net *net;
2298
2299         net = dev_net(dev);
2300
2301         if (!rt_caching(net))
2302                 goto skip_cache;
2303
2304         tos &= IPTOS_RT_MASK;
2305         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2306
2307         rcu_read_lock();
2308         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2309              rth = rcu_dereference(rth->u.dst.rt_next)) {
2310                 if (((rth->fl.fl4_dst ^ daddr) |
2311                      (rth->fl.fl4_src ^ saddr) |
2312                      (rth->fl.iif ^ iif) |
2313                      rth->fl.oif |
2314                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2315                     rth->fl.mark == skb->mark &&
2316                     net_eq(dev_net(rth->u.dst.dev), net) &&
2317                     !rt_is_expired(rth)) {
2318                         dst_use(&rth->u.dst, jiffies);
2319                         RT_CACHE_STAT_INC(in_hit);
2320                         rcu_read_unlock();
2321                         skb_dst_set(skb, &rth->u.dst);
2322                         return 0;
2323                 }
2324                 RT_CACHE_STAT_INC(in_hlist_search);
2325         }
2326         rcu_read_unlock();
2327
2328 skip_cache:
2329         /* Multicast recognition logic is moved from route cache to here.
2330            The problem was that too many Ethernet cards have broken/missing
2331            hardware multicast filters :-( As result the host on multicasting
2332            network acquires a lot of useless route cache entries, sort of
2333            SDR messages from all the world. Now we try to get rid of them.
2334            Really, provided software IP multicast filter is organized
2335            reasonably (at least, hashed), it does not result in a slowdown
2336            comparing with route cache reject entries.
2337            Note, that multicast routers are not affected, because
2338            route cache entry is created eventually.
2339          */
2340         if (ipv4_is_multicast(daddr)) {
2341                 struct in_device *in_dev;
2342
2343                 rcu_read_lock();
2344                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2345                         int our = ip_check_mc(in_dev, daddr, saddr,
2346                                 ip_hdr(skb)->protocol);
2347                         if (our
2348 #ifdef CONFIG_IP_MROUTE
2349                                 ||
2350                             (!ipv4_is_local_multicast(daddr) &&
2351                              IN_DEV_MFORWARD(in_dev))
2352 #endif
2353                            ) {
2354                                 rcu_read_unlock();
2355                                 return ip_route_input_mc(skb, daddr, saddr,
2356                                                          tos, dev, our);
2357                         }
2358                 }
2359                 rcu_read_unlock();
2360                 return -EINVAL;
2361         }
2362         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2363 }
2364
2365 static int __mkroute_output(struct rtable **result,
2366                             struct fib_result *res,
2367                             const struct flowi *fl,
2368                             const struct flowi *oldflp,
2369                             struct net_device *dev_out,
2370                             unsigned flags)
2371 {
2372         struct rtable *rth;
2373         struct in_device *in_dev;
2374         u32 tos = RT_FL_TOS(oldflp);
2375         int err = 0;
2376
2377         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2378                 return -EINVAL;
2379
2380         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2381                 res->type = RTN_BROADCAST;
2382         else if (ipv4_is_multicast(fl->fl4_dst))
2383                 res->type = RTN_MULTICAST;
2384         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2385                 return -EINVAL;
2386
2387         if (dev_out->flags & IFF_LOOPBACK)
2388                 flags |= RTCF_LOCAL;
2389
2390         /* get work reference to inet device */
2391         in_dev = in_dev_get(dev_out);
2392         if (!in_dev)
2393                 return -EINVAL;
2394
2395         if (res->type == RTN_BROADCAST) {
2396                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2397                 if (res->fi) {
2398                         fib_info_put(res->fi);
2399                         res->fi = NULL;
2400                 }
2401         } else if (res->type == RTN_MULTICAST) {
2402                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2403                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2404                                  oldflp->proto))
2405                         flags &= ~RTCF_LOCAL;
2406                 /* If multicast route do not exist use
2407                    default one, but do not gateway in this case.
2408                    Yes, it is hack.
2409                  */
2410                 if (res->fi && res->prefixlen < 4) {
2411                         fib_info_put(res->fi);
2412                         res->fi = NULL;
2413                 }
2414         }
2415
2416
2417         rth = dst_alloc(&ipv4_dst_ops);
2418         if (!rth) {
2419                 err = -ENOBUFS;
2420                 goto cleanup;
2421         }
2422
2423         atomic_set(&rth->u.dst.__refcnt, 1);
2424         rth->u.dst.flags= DST_HOST;
2425         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2426                 rth->u.dst.flags |= DST_NOXFRM;
2427         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2428                 rth->u.dst.flags |= DST_NOPOLICY;
2429
2430         rth->fl.fl4_dst = oldflp->fl4_dst;
2431         rth->fl.fl4_tos = tos;
2432         rth->fl.fl4_src = oldflp->fl4_src;
2433         rth->fl.oif     = oldflp->oif;
2434         rth->fl.mark    = oldflp->mark;
2435         rth->rt_dst     = fl->fl4_dst;
2436         rth->rt_src     = fl->fl4_src;
2437         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2438         /* get references to the devices that are to be hold by the routing
2439            cache entry */
2440         rth->u.dst.dev  = dev_out;
2441         dev_hold(dev_out);
2442         rth->idev       = in_dev_get(dev_out);
2443         rth->rt_gateway = fl->fl4_dst;
2444         rth->rt_spec_dst= fl->fl4_src;
2445
2446         rth->u.dst.output=ip_output;
2447         rth->rt_genid = rt_genid(dev_net(dev_out));
2448
2449         RT_CACHE_STAT_INC(out_slow_tot);
2450
2451         if (flags & RTCF_LOCAL) {
2452                 rth->u.dst.input = ip_local_deliver;
2453                 rth->rt_spec_dst = fl->fl4_dst;
2454         }
2455         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2456                 rth->rt_spec_dst = fl->fl4_src;
2457                 if (flags & RTCF_LOCAL &&
2458                     !(dev_out->flags & IFF_LOOPBACK)) {
2459                         rth->u.dst.output = ip_mc_output;
2460                         RT_CACHE_STAT_INC(out_slow_mc);
2461                 }
2462 #ifdef CONFIG_IP_MROUTE
2463                 if (res->type == RTN_MULTICAST) {
2464                         if (IN_DEV_MFORWARD(in_dev) &&
2465                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2466                                 rth->u.dst.input = ip_mr_input;
2467                                 rth->u.dst.output = ip_mc_output;
2468                         }
2469                 }
2470 #endif
2471         }
2472
2473         rt_set_nexthop(rth, res, 0);
2474
2475         rth->rt_flags = flags;
2476
2477         *result = rth;
2478  cleanup:
2479         /* release work reference to inet device */
2480         in_dev_put(in_dev);
2481
2482         return err;
2483 }
2484
2485 static int ip_mkroute_output(struct rtable **rp,
2486                              struct fib_result *res,
2487                              const struct flowi *fl,
2488                              const struct flowi *oldflp,
2489                              struct net_device *dev_out,
2490                              unsigned flags)
2491 {
2492         struct rtable *rth = NULL;
2493         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2494         unsigned hash;
2495         if (err == 0) {
2496                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2497                                rt_genid(dev_net(dev_out)));
2498                 err = rt_intern_hash(hash, rth, rp, NULL);
2499         }
2500
2501         return err;
2502 }
2503
2504 /*
2505  * Major route resolver routine.
2506  */
2507
2508 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2509                                 const struct flowi *oldflp)
2510 {
2511         u32 tos = RT_FL_TOS(oldflp);
2512         struct flowi fl = { .nl_u = { .ip4_u =
2513                                       { .daddr = oldflp->fl4_dst,
2514                                         .saddr = oldflp->fl4_src,
2515                                         .tos = tos & IPTOS_RT_MASK,
2516                                         .scope = ((tos & RTO_ONLINK) ?
2517                                                   RT_SCOPE_LINK :
2518                                                   RT_SCOPE_UNIVERSE),
2519                                       } },
2520                             .mark = oldflp->mark,
2521                             .iif = net->loopback_dev->ifindex,
2522                             .oif = oldflp->oif };
2523         struct fib_result res;
2524         unsigned flags = 0;
2525         struct net_device *dev_out = NULL;
2526         int free_res = 0;
2527         int err;
2528
2529
2530         res.fi          = NULL;
2531 #ifdef CONFIG_IP_MULTIPLE_TABLES
2532         res.r           = NULL;
2533 #endif
2534
2535         if (oldflp->fl4_src) {
2536                 err = -EINVAL;
2537                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2538                     ipv4_is_lbcast(oldflp->fl4_src) ||
2539                     ipv4_is_zeronet(oldflp->fl4_src))
2540                         goto out;
2541
2542                 /* I removed check for oif == dev_out->oif here.
2543                    It was wrong for two reasons:
2544                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2545                       is assigned to multiple interfaces.
2546                    2. Moreover, we are allowed to send packets with saddr
2547                       of another iface. --ANK
2548                  */
2549
2550                 if (oldflp->oif == 0 &&
2551                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2552                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2553                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2554                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2555                         if (dev_out == NULL)
2556                                 goto out;
2557
2558                         /* Special hack: user can direct multicasts
2559                            and limited broadcast via necessary interface
2560                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2561                            This hack is not just for fun, it allows
2562                            vic,vat and friends to work.
2563                            They bind socket to loopback, set ttl to zero
2564                            and expect that it will work.
2565                            From the viewpoint of routing cache they are broken,
2566                            because we are not allowed to build multicast path
2567                            with loopback source addr (look, routing cache
2568                            cannot know, that ttl is zero, so that packet
2569                            will not leave this host and route is valid).
2570                            Luckily, this hack is good workaround.
2571                          */
2572
2573                         fl.oif = dev_out->ifindex;
2574                         goto make_route;
2575                 }
2576
2577                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2578                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2579                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2580                         if (dev_out == NULL)
2581                                 goto out;
2582                         dev_put(dev_out);
2583                         dev_out = NULL;
2584                 }
2585         }
2586
2587
2588         if (oldflp->oif) {
2589                 dev_out = dev_get_by_index(net, oldflp->oif);
2590                 err = -ENODEV;
2591                 if (dev_out == NULL)
2592                         goto out;
2593
2594                 /* RACE: Check return value of inet_select_addr instead. */
2595                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2596                         dev_put(dev_out);
2597                         goto out;       /* Wrong error code */
2598                 }
2599
2600                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2601                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2602                         if (!fl.fl4_src)
2603                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2604                                                               RT_SCOPE_LINK);
2605                         goto make_route;
2606                 }
2607                 if (!fl.fl4_src) {
2608                         if (ipv4_is_multicast(oldflp->fl4_dst))
2609                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2610                                                               fl.fl4_scope);
2611                         else if (!oldflp->fl4_dst)
2612                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2613                                                               RT_SCOPE_HOST);
2614                 }
2615         }
2616
2617         if (!fl.fl4_dst) {
2618                 fl.fl4_dst = fl.fl4_src;
2619                 if (!fl.fl4_dst)
2620                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2621                 if (dev_out)
2622                         dev_put(dev_out);
2623                 dev_out = net->loopback_dev;
2624                 dev_hold(dev_out);
2625                 fl.oif = net->loopback_dev->ifindex;
2626                 res.type = RTN_LOCAL;
2627                 flags |= RTCF_LOCAL;
2628                 goto make_route;
2629         }
2630
2631         if (fib_lookup(net, &fl, &res)) {
2632                 res.fi = NULL;
2633                 if (oldflp->oif) {
2634                         /* Apparently, routing tables are wrong. Assume,
2635                            that the destination is on link.
2636
2637                            WHY? DW.
2638                            Because we are allowed to send to iface
2639                            even if it has NO routes and NO assigned
2640                            addresses. When oif is specified, routing
2641                            tables are looked up with only one purpose:
2642                            to catch if destination is gatewayed, rather than
2643                            direct. Moreover, if MSG_DONTROUTE is set,
2644                            we send packet, ignoring both routing tables
2645                            and ifaddr state. --ANK
2646
2647
2648                            We could make it even if oif is unknown,
2649                            likely IPv6, but we do not.
2650                          */
2651
2652                         if (fl.fl4_src == 0)
2653                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2654                                                               RT_SCOPE_LINK);
2655                         res.type = RTN_UNICAST;
2656                         goto make_route;
2657                 }
2658                 if (dev_out)
2659                         dev_put(dev_out);
2660                 err = -ENETUNREACH;
2661                 goto out;
2662         }
2663         free_res = 1;
2664
2665         if (res.type == RTN_LOCAL) {
2666                 if (!fl.fl4_src)
2667                         fl.fl4_src = fl.fl4_dst;
2668                 if (dev_out)
2669                         dev_put(dev_out);
2670                 dev_out = net->loopback_dev;
2671                 dev_hold(dev_out);
2672                 fl.oif = dev_out->ifindex;
2673                 if (res.fi)
2674                         fib_info_put(res.fi);
2675                 res.fi = NULL;
2676                 flags |= RTCF_LOCAL;
2677                 goto make_route;
2678         }
2679
2680 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2681         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2682                 fib_select_multipath(&fl, &res);
2683         else
2684 #endif
2685         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2686                 fib_select_default(net, &fl, &res);
2687
2688         if (!fl.fl4_src)
2689                 fl.fl4_src = FIB_RES_PREFSRC(res);
2690
2691         if (dev_out)
2692                 dev_put(dev_out);
2693         dev_out = FIB_RES_DEV(res);
2694         dev_hold(dev_out);
2695         fl.oif = dev_out->ifindex;
2696
2697
2698 make_route:
2699         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2700
2701
2702         if (free_res)
2703                 fib_res_put(&res);
2704         if (dev_out)
2705                 dev_put(dev_out);
2706 out:    return err;
2707 }
2708
2709 int __ip_route_output_key(struct net *net, struct rtable **rp,
2710                           const struct flowi *flp)
2711 {
2712         unsigned hash;
2713         struct rtable *rth;
2714
2715         if (!rt_caching(net))
2716                 goto slow_output;
2717
2718         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2719
2720         rcu_read_lock_bh();
2721         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2722                 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2723                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2724                     rth->fl.fl4_src == flp->fl4_src &&
2725                     rth->fl.iif == 0 &&
2726                     rth->fl.oif == flp->oif &&
2727                     rth->fl.mark == flp->mark &&
2728                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2729                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2730                     net_eq(dev_net(rth->u.dst.dev), net) &&
2731                     !rt_is_expired(rth)) {
2732                         dst_use(&rth->u.dst, jiffies);
2733                         RT_CACHE_STAT_INC(out_hit);
2734                         rcu_read_unlock_bh();
2735                         *rp = rth;
2736                         return 0;
2737                 }
2738                 RT_CACHE_STAT_INC(out_hlist_search);
2739         }
2740         rcu_read_unlock_bh();
2741
2742 slow_output:
2743         return ip_route_output_slow(net, rp, flp);
2744 }
2745
2746 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2747
2748 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2749 {
2750 }
2751
2752 static struct dst_ops ipv4_dst_blackhole_ops = {
2753         .family                 =       AF_INET,
2754         .protocol               =       cpu_to_be16(ETH_P_IP),
2755         .destroy                =       ipv4_dst_destroy,
2756         .check                  =       ipv4_dst_check,
2757         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2758         .entries                =       ATOMIC_INIT(0),
2759 };
2760
2761
2762 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2763 {
2764         struct rtable *ort = *rp;
2765         struct rtable *rt = (struct rtable *)
2766                 dst_alloc(&ipv4_dst_blackhole_ops);
2767
2768         if (rt) {
2769                 struct dst_entry *new = &rt->u.dst;
2770
2771                 atomic_set(&new->__refcnt, 1);
2772                 new->__use = 1;
2773                 new->input = dst_discard;
2774                 new->output = dst_discard;
2775                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2776
2777                 new->dev = ort->u.dst.dev;
2778                 if (new->dev)
2779                         dev_hold(new->dev);
2780
2781                 rt->fl = ort->fl;
2782
2783                 rt->idev = ort->idev;
2784                 if (rt->idev)
2785                         in_dev_hold(rt->idev);
2786                 rt->rt_genid = rt_genid(net);
2787                 rt->rt_flags = ort->rt_flags;
2788                 rt->rt_type = ort->rt_type;
2789                 rt->rt_dst = ort->rt_dst;
2790                 rt->rt_src = ort->rt_src;
2791                 rt->rt_iif = ort->rt_iif;
2792                 rt->rt_gateway = ort->rt_gateway;
2793                 rt->rt_spec_dst = ort->rt_spec_dst;
2794                 rt->peer = ort->peer;
2795                 if (rt->peer)
2796                         atomic_inc(&rt->peer->refcnt);
2797
2798                 dst_free(new);
2799         }
2800
2801         dst_release(&(*rp)->u.dst);
2802         *rp = rt;
2803         return (rt ? 0 : -ENOMEM);
2804 }
2805
2806 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2807                          struct sock *sk, int flags)
2808 {
2809         int err;
2810
2811         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2812                 return err;
2813
2814         if (flp->proto) {
2815                 if (!flp->fl4_src)
2816                         flp->fl4_src = (*rp)->rt_src;
2817                 if (!flp->fl4_dst)
2818                         flp->fl4_dst = (*rp)->rt_dst;
2819                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2820                                     flags ? XFRM_LOOKUP_WAIT : 0);
2821                 if (err == -EREMOTE)
2822                         err = ipv4_dst_blackhole(net, rp, flp);
2823
2824                 return err;
2825         }
2826
2827         return 0;
2828 }
2829
2830 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2831
2832 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2833 {
2834         return ip_route_output_flow(net, rp, flp, NULL, 0);
2835 }
2836
2837 static int rt_fill_info(struct net *net,
2838                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2839                         int nowait, unsigned int flags)
2840 {
2841         struct rtable *rt = skb_rtable(skb);
2842         struct rtmsg *r;
2843         struct nlmsghdr *nlh;
2844         long expires;
2845         u32 id = 0, ts = 0, tsage = 0, error;
2846
2847         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2848         if (nlh == NULL)
2849                 return -EMSGSIZE;
2850
2851         r = nlmsg_data(nlh);
2852         r->rtm_family    = AF_INET;
2853         r->rtm_dst_len  = 32;
2854         r->rtm_src_len  = 0;
2855         r->rtm_tos      = rt->fl.fl4_tos;
2856         r->rtm_table    = RT_TABLE_MAIN;
2857         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2858         r->rtm_type     = rt->rt_type;
2859         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2860         r->rtm_protocol = RTPROT_UNSPEC;
2861         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2862         if (rt->rt_flags & RTCF_NOTIFY)
2863                 r->rtm_flags |= RTM_F_NOTIFY;
2864
2865         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2866
2867         if (rt->fl.fl4_src) {
2868                 r->rtm_src_len = 32;
2869                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2870         }
2871         if (rt->u.dst.dev)
2872                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2873 #ifdef CONFIG_NET_CLS_ROUTE
2874         if (rt->u.dst.tclassid)
2875                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2876 #endif
2877         if (rt->fl.iif)
2878                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2879         else if (rt->rt_src != rt->fl.fl4_src)
2880                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2881
2882         if (rt->rt_dst != rt->rt_gateway)
2883                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2884
2885         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2886                 goto nla_put_failure;
2887
2888         error = rt->u.dst.error;
2889         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2890         if (rt->peer) {
2891                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2892                 if (rt->peer->tcp_ts_stamp) {
2893                         ts = rt->peer->tcp_ts;
2894                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2895                 }
2896         }
2897
2898         if (rt->fl.iif) {
2899 #ifdef CONFIG_IP_MROUTE
2900                 __be32 dst = rt->rt_dst;
2901
2902                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2903                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2904                         int err = ipmr_get_route(net, skb, r, nowait);
2905                         if (err <= 0) {
2906                                 if (!nowait) {
2907                                         if (err == 0)
2908                                                 return 0;
2909                                         goto nla_put_failure;
2910                                 } else {
2911                                         if (err == -EMSGSIZE)
2912                                                 goto nla_put_failure;
2913                                         error = err;
2914                                 }
2915                         }
2916                 } else
2917 #endif
2918                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2919         }
2920
2921         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2922                                expires, error) < 0)
2923                 goto nla_put_failure;
2924
2925         return nlmsg_end(skb, nlh);
2926
2927 nla_put_failure:
2928         nlmsg_cancel(skb, nlh);
2929         return -EMSGSIZE;
2930 }
2931
2932 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2933 {
2934         struct net *net = sock_net(in_skb->sk);
2935         struct rtmsg *rtm;
2936         struct nlattr *tb[RTA_MAX+1];
2937         struct rtable *rt = NULL;
2938         __be32 dst = 0;
2939         __be32 src = 0;
2940         u32 iif;
2941         int err;
2942         struct sk_buff *skb;
2943
2944         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2945         if (err < 0)
2946                 goto errout;
2947
2948         rtm = nlmsg_data(nlh);
2949
2950         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2951         if (skb == NULL) {
2952                 err = -ENOBUFS;
2953                 goto errout;
2954         }
2955
2956         /* Reserve room for dummy headers, this skb can pass
2957            through good chunk of routing engine.
2958          */
2959         skb_reset_mac_header(skb);
2960         skb_reset_network_header(skb);
2961
2962         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2963         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2964         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2965
2966         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2967         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2968         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2969
2970         if (iif) {
2971                 struct net_device *dev;
2972
2973                 dev = __dev_get_by_index(net, iif);
2974                 if (dev == NULL) {
2975                         err = -ENODEV;
2976                         goto errout_free;
2977                 }
2978
2979                 skb->protocol   = htons(ETH_P_IP);
2980                 skb->dev        = dev;
2981                 local_bh_disable();
2982                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2983                 local_bh_enable();
2984
2985                 rt = skb_rtable(skb);
2986                 if (err == 0 && rt->u.dst.error)
2987                         err = -rt->u.dst.error;
2988         } else {
2989                 struct flowi fl = {
2990                         .nl_u = {
2991                                 .ip4_u = {
2992                                         .daddr = dst,
2993                                         .saddr = src,
2994                                         .tos = rtm->rtm_tos,
2995                                 },
2996                         },
2997                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2998                 };
2999                 err = ip_route_output_key(net, &rt, &fl);
3000         }
3001
3002         if (err)
3003                 goto errout_free;
3004
3005         skb_dst_set(skb, &rt->u.dst);
3006         if (rtm->rtm_flags & RTM_F_NOTIFY)
3007                 rt->rt_flags |= RTCF_NOTIFY;
3008
3009         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3010                            RTM_NEWROUTE, 0, 0);
3011         if (err <= 0)
3012                 goto errout_free;
3013
3014         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3015 errout:
3016         return err;
3017
3018 errout_free:
3019         kfree_skb(skb);
3020         goto errout;
3021 }
3022
3023 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3024 {
3025         struct rtable *rt;
3026         int h, s_h;
3027         int idx, s_idx;
3028         struct net *net;
3029
3030         net = sock_net(skb->sk);
3031
3032         s_h = cb->args[0];
3033         if (s_h < 0)
3034                 s_h = 0;
3035         s_idx = idx = cb->args[1];
3036         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3037                 if (!rt_hash_table[h].chain)
3038                         continue;
3039                 rcu_read_lock_bh();
3040                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3041                      rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3042                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3043                                 continue;
3044                         if (rt_is_expired(rt))
3045                                 continue;
3046                         skb_dst_set(skb, dst_clone(&rt->u.dst));
3047                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3048                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3049                                          1, NLM_F_MULTI) <= 0) {
3050                                 skb_dst_drop(skb);
3051                                 rcu_read_unlock_bh();
3052                                 goto done;
3053                         }
3054                         skb_dst_drop(skb);
3055                 }
3056                 rcu_read_unlock_bh();
3057         }
3058
3059 done:
3060         cb->args[0] = h;
3061         cb->args[1] = idx;
3062         return skb->len;
3063 }
3064
3065 void ip_rt_multicast_event(struct in_device *in_dev)
3066 {
3067         rt_cache_flush(dev_net(in_dev->dev), 0);
3068 }
3069
3070 #ifdef CONFIG_SYSCTL
3071 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3072                                         void __user *buffer,
3073                                         size_t *lenp, loff_t *ppos)
3074 {
3075         if (write) {
3076                 int flush_delay;
3077                 ctl_table ctl;
3078                 struct net *net;
3079
3080                 memcpy(&ctl, __ctl, sizeof(ctl));
3081                 ctl.data = &flush_delay;
3082                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3083
3084                 net = (struct net *)__ctl->extra1;
3085                 rt_cache_flush(net, flush_delay);
3086                 return 0;
3087         }
3088
3089         return -EINVAL;
3090 }
3091
3092 static void rt_secret_reschedule(int old)
3093 {
3094         struct net *net;
3095         int new = ip_rt_secret_interval;
3096         int diff = new - old;
3097
3098         if (!diff)
3099                 return;
3100
3101         rtnl_lock();
3102         for_each_net(net) {
3103                 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3104                 long time;
3105
3106                 if (!new)
3107                         continue;
3108
3109                 if (deleted) {
3110                         time = net->ipv4.rt_secret_timer.expires - jiffies;
3111
3112                         if (time <= 0 || (time += diff) <= 0)
3113                                 time = 0;
3114                 } else
3115                         time = new;
3116
3117                 mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
3118         }
3119         rtnl_unlock();
3120 }
3121
3122 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3123                                           void __user *buffer, size_t *lenp,
3124                                           loff_t *ppos)
3125 {
3126         int old = ip_rt_secret_interval;
3127         int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3128
3129         rt_secret_reschedule(old);
3130
3131         return ret;
3132 }
3133
3134 static ctl_table ipv4_route_table[] = {
3135         {
3136                 .procname       = "gc_thresh",
3137                 .data           = &ipv4_dst_ops.gc_thresh,
3138                 .maxlen         = sizeof(int),
3139                 .mode           = 0644,
3140                 .proc_handler   = proc_dointvec,
3141         },
3142         {
3143                 .procname       = "max_size",
3144                 .data           = &ip_rt_max_size,
3145                 .maxlen         = sizeof(int),
3146                 .mode           = 0644,
3147                 .proc_handler   = proc_dointvec,
3148         },
3149         {
3150                 /*  Deprecated. Use gc_min_interval_ms */
3151
3152                 .procname       = "gc_min_interval",
3153                 .data           = &ip_rt_gc_min_interval,
3154                 .maxlen         = sizeof(int),
3155                 .mode           = 0644,
3156                 .proc_handler   = proc_dointvec_jiffies,
3157         },
3158         {
3159                 .procname       = "gc_min_interval_ms",
3160                 .data           = &ip_rt_gc_min_interval,
3161                 .maxlen         = sizeof(int),
3162                 .mode           = 0644,
3163                 .proc_handler   = proc_dointvec_ms_jiffies,
3164         },
3165         {
3166                 .procname       = "gc_timeout",
3167                 .data           = &ip_rt_gc_timeout,
3168                 .maxlen         = sizeof(int),
3169                 .mode           = 0644,
3170                 .proc_handler   = proc_dointvec_jiffies,
3171         },
3172         {
3173                 .procname       = "gc_interval",
3174                 .data           = &ip_rt_gc_interval,
3175                 .maxlen         = sizeof(int),
3176                 .mode           = 0644,
3177                 .proc_handler   = proc_dointvec_jiffies,
3178         },
3179         {
3180                 .procname       = "redirect_load",
3181                 .data           = &ip_rt_redirect_load,
3182                 .maxlen         = sizeof(int),
3183                 .mode           = 0644,
3184                 .proc_handler   = proc_dointvec,
3185         },
3186         {
3187                 .procname       = "redirect_number",
3188                 .data           = &ip_rt_redirect_number,
3189                 .maxlen         = sizeof(int),
3190                 .mode           = 0644,
3191                 .proc_handler   = proc_dointvec,
3192         },
3193         {
3194                 .procname       = "redirect_silence",
3195                 .data           = &ip_rt_redirect_silence,
3196                 .maxlen         = sizeof(int),
3197                 .mode           = 0644,
3198                 .proc_handler   = proc_dointvec,
3199         },
3200         {
3201                 .procname       = "error_cost",
3202                 .data           = &ip_rt_error_cost,
3203                 .maxlen         = sizeof(int),
3204                 .mode           = 0644,
3205                 .proc_handler   = proc_dointvec,
3206         },
3207         {
3208                 .procname       = "error_burst",
3209                 .data           = &ip_rt_error_burst,
3210                 .maxlen         = sizeof(int),
3211                 .mode           = 0644,
3212                 .proc_handler   = proc_dointvec,
3213         },
3214         {
3215                 .procname       = "gc_elasticity",
3216                 .data           = &ip_rt_gc_elasticity,
3217                 .maxlen         = sizeof(int),
3218                 .mode           = 0644,
3219                 .proc_handler   = proc_dointvec,
3220         },
3221         {
3222                 .procname       = "mtu_expires",
3223                 .data           = &ip_rt_mtu_expires,
3224                 .maxlen         = sizeof(int),
3225                 .mode           = 0644,
3226                 .proc_handler   = proc_dointvec_jiffies,
3227         },
3228         {
3229                 .procname       = "min_pmtu",
3230                 .data           = &ip_rt_min_pmtu,
3231                 .maxlen         = sizeof(int),
3232                 .mode           = 0644,
3233                 .proc_handler   = proc_dointvec,
3234         },
3235         {
3236                 .procname       = "min_adv_mss",
3237                 .data           = &ip_rt_min_advmss,
3238                 .maxlen         = sizeof(int),
3239                 .mode           = 0644,
3240                 .proc_handler   = proc_dointvec,
3241         },
3242         {
3243                 .procname       = "secret_interval",
3244                 .data           = &ip_rt_secret_interval,
3245                 .maxlen         = sizeof(int),
3246                 .mode           = 0644,
3247                 .proc_handler   = ipv4_sysctl_rt_secret_interval,
3248         },
3249         { }
3250 };
3251
3252 static struct ctl_table empty[1];
3253
3254 static struct ctl_table ipv4_skeleton[] =
3255 {
3256         { .procname = "route",
3257           .mode = 0555, .child = ipv4_route_table},
3258         { .procname = "neigh",
3259           .mode = 0555, .child = empty},
3260         { }
3261 };
3262
3263 static __net_initdata struct ctl_path ipv4_path[] = {
3264         { .procname = "net", },
3265         { .procname = "ipv4", },
3266         { },
3267 };
3268
3269 static struct ctl_table ipv4_route_flush_table[] = {
3270         {
3271                 .procname       = "flush",
3272                 .maxlen         = sizeof(int),
3273                 .mode           = 0200,
3274                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3275         },
3276         { },
3277 };
3278
3279 static __net_initdata struct ctl_path ipv4_route_path[] = {
3280         { .procname = "net", },
3281         { .procname = "ipv4", },
3282         { .procname = "route", },
3283         { },
3284 };
3285
3286 static __net_init int sysctl_route_net_init(struct net *net)
3287 {
3288         struct ctl_table *tbl;
3289
3290         tbl = ipv4_route_flush_table;
3291         if (!net_eq(net, &init_net)) {
3292                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3293                 if (tbl == NULL)
3294                         goto err_dup;
3295         }
3296         tbl[0].extra1 = net;
3297
3298         net->ipv4.route_hdr =
3299                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3300         if (net->ipv4.route_hdr == NULL)
3301                 goto err_reg;
3302         return 0;
3303
3304 err_reg:
3305         if (tbl != ipv4_route_flush_table)
3306                 kfree(tbl);
3307 err_dup:
3308         return -ENOMEM;
3309 }
3310
3311 static __net_exit void sysctl_route_net_exit(struct net *net)
3312 {
3313         struct ctl_table *tbl;
3314
3315         tbl = net->ipv4.route_hdr->ctl_table_arg;
3316         unregister_net_sysctl_table(net->ipv4.route_hdr);
3317         BUG_ON(tbl == ipv4_route_flush_table);
3318         kfree(tbl);
3319 }
3320
3321 static __net_initdata struct pernet_operations sysctl_route_ops = {
3322         .init = sysctl_route_net_init,
3323         .exit = sysctl_route_net_exit,
3324 };
3325 #endif
3326
3327
3328 static __net_init int rt_secret_timer_init(struct net *net)
3329 {
3330         atomic_set(&net->ipv4.rt_genid,
3331                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3332                         (jiffies ^ (jiffies >> 7))));
3333
3334         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3335         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3336         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3337
3338         if (ip_rt_secret_interval) {
3339                 net->ipv4.rt_secret_timer.expires =
3340                         jiffies + net_random() % ip_rt_secret_interval +
3341                         ip_rt_secret_interval;
3342                 add_timer(&net->ipv4.rt_secret_timer);
3343         }
3344         return 0;
3345 }
3346
3347 static __net_exit void rt_secret_timer_exit(struct net *net)
3348 {
3349         del_timer_sync(&net->ipv4.rt_secret_timer);
3350 }
3351
3352 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3353         .init = rt_secret_timer_init,
3354         .exit = rt_secret_timer_exit,
3355 };
3356
3357
3358 #ifdef CONFIG_NET_CLS_ROUTE
3359 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3360 #endif /* CONFIG_NET_CLS_ROUTE */
3361
3362 static __initdata unsigned long rhash_entries;
3363 static int __init set_rhash_entries(char *str)
3364 {
3365         if (!str)
3366                 return 0;
3367         rhash_entries = simple_strtoul(str, &str, 0);
3368         return 1;
3369 }
3370 __setup("rhash_entries=", set_rhash_entries);
3371
3372 int __init ip_rt_init(void)
3373 {
3374         int rc = 0;
3375
3376 #ifdef CONFIG_NET_CLS_ROUTE
3377         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3378         if (!ip_rt_acct)
3379                 panic("IP: failed to allocate ip_rt_acct\n");
3380 #endif
3381
3382         ipv4_dst_ops.kmem_cachep =
3383                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3384                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3385
3386         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3387
3388         rt_hash_table = (struct rt_hash_bucket *)
3389                 alloc_large_system_hash("IP route cache",
3390                                         sizeof(struct rt_hash_bucket),
3391                                         rhash_entries,
3392                                         (totalram_pages >= 128 * 1024) ?
3393                                         15 : 17,
3394                                         0,
3395                                         &rt_hash_log,
3396                                         &rt_hash_mask,
3397                                         rhash_entries ? 0 : 512 * 1024);
3398         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3399         rt_hash_lock_init();
3400
3401         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3402         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3403
3404         devinet_init();
3405         ip_fib_init();
3406
3407         /* All the timers, started at system startup tend
3408            to synchronize. Perturb it a bit.
3409          */
3410         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3411         expires_ljiffies = jiffies;
3412         schedule_delayed_work(&expires_work,
3413                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3414
3415         if (register_pernet_subsys(&rt_secret_timer_ops))
3416                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3417
3418         if (ip_rt_proc_init())
3419                 printk(KERN_ERR "Unable to create route proc files\n");
3420 #ifdef CONFIG_XFRM
3421         xfrm_init();
3422         xfrm4_init(ip_rt_max_size);
3423 #endif
3424         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3425
3426 #ifdef CONFIG_SYSCTL
3427         register_pernet_subsys(&sysctl_route_ops);
3428 #endif
3429         return rc;
3430 }
3431
3432 #ifdef CONFIG_SYSCTL
3433 /*
3434  * We really need to sanitize the damn ipv4 init order, then all
3435  * this nonsense will go away.
3436  */
3437 void __init ip_static_sysctl_init(void)
3438 {
3439         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3440 }
3441 #endif
3442
3443 EXPORT_SYMBOL(__ip_select_ident);
3444 EXPORT_SYMBOL(ip_route_input);
3445 EXPORT_SYMBOL(ip_route_output_key);