net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111 #include <net/atmclip.h>
 112 #include <net/secure_seq.h>
 113
 114 #define RT_FL_TOS(oldflp4) \
 115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 116
 117 #define IP_MAX_MTU      0xFFF0
 118
 119 #define RT_GC_TIMEOUT (300*HZ)
 120
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_gc_elasticity __read_mostly    = 8;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134 static int rt_chain_length_max __read_mostly    = 20;
 135
 136 /*
 137  *      Interface to generic destination cache.
 138  */
 139
 140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 141 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 142 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
 143 static void              ipv4_dst_destroy(struct dst_entry *dst);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 147 static int rt_garbage_collect(struct dst_ops *ops);
 148
 149 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 150                             int how)
 151 {
 152 }
 153
 154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 155 {
 156         struct rtable *rt = (struct rtable *) dst;
 157         struct inet_peer *peer;
 158         u32 *p = NULL;
 159
 160         if (!rt->peer)
 161                 rt_bind_peer(rt, rt->rt_dst, 1);
 162
 163         peer = rt->peer;
 164         if (peer) {
 165                 u32 *old_p = __DST_METRICS_PTR(old);
 166                 unsigned long prev, new;
 167
 168                 p = peer->metrics;
 169                 if (inet_metrics_new(peer))
 170                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 171
 172                 new = (unsigned long) p;
 173                 prev = cmpxchg(&dst->_metrics, old, new);
 174
 175                 if (prev != old) {
 176                         p = __DST_METRICS_PTR(prev);
 177                         if (prev & DST_METRICS_READ_ONLY)
 178                                 p = NULL;
 179                 } else {
 180                         if (rt->fi) {
 181                                 fib_info_put(rt->fi);
 182                                 rt->fi = NULL;
 183                         }
 184                 }
 185         }
 186         return p;
 187 }
 188
 189 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 190
 191 static struct dst_ops ipv4_dst_ops = {
 192         .family =               AF_INET,
 193         .protocol =             cpu_to_be16(ETH_P_IP),
 194         .gc =                   rt_garbage_collect,
 195         .check =                ipv4_dst_check,
 196         .default_advmss =       ipv4_default_advmss,
 197         .default_mtu =          ipv4_default_mtu,
 198         .cow_metrics =          ipv4_cow_metrics,
 199         .destroy =              ipv4_dst_destroy,
 200         .ifdown =               ipv4_dst_ifdown,
 201         .negative_advice =      ipv4_negative_advice,
 202         .link_failure =         ipv4_link_failure,
 203         .update_pmtu =          ip_rt_update_pmtu,
 204         .local_out =            __ip_local_out,
 205         .neigh_lookup =         ipv4_neigh_lookup,
 206 };
 207
 208 #define ECN_OR_COST(class)      TC_PRIO_##class
 209
 210 const __u8 ip_tos2prio[16] = {
 211         TC_PRIO_BESTEFFORT,
 212         ECN_OR_COST(BESTEFFORT),
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BULK,
 216         ECN_OR_COST(BULK),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_INTERACTIVE,
 220         ECN_OR_COST(INTERACTIVE),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE_BULK,
 224         ECN_OR_COST(INTERACTIVE_BULK),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK)
 227 };
 228
 229
 230 /*
 231  * Route cache.
 232  */
 233
 234 /* The locking scheme is rather straight forward:
 235  *
 236  * 1) Read-Copy Update protects the buckets of the central route hash.
 237  * 2) Only writers remove entries, and they hold the lock
 238  *    as they look at rtable reference counts.
 239  * 3) Only readers acquire references to rtable entries,
 240  *    they do so with atomic increments and with the
 241  *    lock held.
 242  */
 243
 244 struct rt_hash_bucket {
 245         struct rtable __rcu     *chain;
 246 };
 247
 248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 249         defined(CONFIG_PROVE_LOCKING)
 250 /*
 251  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 252  * The size of this table is a power of two and depends on the number of CPUS.
 253  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 254  */
 255 #ifdef CONFIG_LOCKDEP
 256 # define RT_HASH_LOCK_SZ        256
 257 #else
 258 # if NR_CPUS >= 32
 259 #  define RT_HASH_LOCK_SZ       4096
 260 # elif NR_CPUS >= 16
 261 #  define RT_HASH_LOCK_SZ       2048
 262 # elif NR_CPUS >= 8
 263 #  define RT_HASH_LOCK_SZ       1024
 264 # elif NR_CPUS >= 4
 265 #  define RT_HASH_LOCK_SZ       512
 266 # else
 267 #  define RT_HASH_LOCK_SZ       256
 268 # endif
 269 #endif
 270
 271 static spinlock_t       *rt_hash_locks;
 272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 273
 274 static __init void rt_hash_lock_init(void)
 275 {
 276         int i;
 277
 278         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 279                         GFP_KERNEL);
 280         if (!rt_hash_locks)
 281                 panic("IP: failed to allocate rt_hash_locks\n");
 282
 283         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 284                 spin_lock_init(&rt_hash_locks[i]);
 285 }
 286 #else
 287 # define rt_hash_lock_addr(slot) NULL
 288
 289 static inline void rt_hash_lock_init(void)
 290 {
 291 }
 292 #endif
 293
 294 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 295 static unsigned                 rt_hash_mask __read_mostly;
 296 static unsigned int             rt_hash_log  __read_mostly;
 297
 298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 300
 301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 302                                    int genid)
 303 {
 304         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 305                             idx, genid)
 306                 & rt_hash_mask;
 307 }
 308
 309 static inline int rt_genid(struct net *net)
 310 {
 311         return atomic_read(&net->ipv4.rt_genid);
 312 }
 313
 314 #ifdef CONFIG_PROC_FS
 315 struct rt_cache_iter_state {
 316         struct seq_net_private p;
 317         int bucket;
 318         int genid;
 319 };
 320
 321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         struct rtable *r = NULL;
 325
 326         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 327                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 328                         continue;
 329                 rcu_read_lock_bh();
 330                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 331                 while (r) {
 332                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 333                             r->rt_genid == st->genid)
 334                                 return r;
 335                         r = rcu_dereference_bh(r->dst.rt_next);
 336                 }
 337                 rcu_read_unlock_bh();
 338         }
 339         return r;
 340 }
 341
 342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 343                                           struct rtable *r)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346
 347         r = rcu_dereference_bh(r->dst.rt_next);
 348         while (!r) {
 349                 rcu_read_unlock_bh();
 350                 do {
 351                         if (--st->bucket < 0)
 352                                 return NULL;
 353                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 354                 rcu_read_lock_bh();
 355                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 356         }
 357         return r;
 358 }
 359
 360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 361                                         struct rtable *r)
 362 {
 363         struct rt_cache_iter_state *st = seq->private;
 364         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 365                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 366                         continue;
 367                 if (r->rt_genid == st->genid)
 368                         break;
 369         }
 370         return r;
 371 }
 372
 373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 374 {
 375         struct rtable *r = rt_cache_get_first(seq);
 376
 377         if (r)
 378                 while (pos && (r = rt_cache_get_next(seq, r)))
 379                         --pos;
 380         return pos ? NULL : r;
 381 }
 382
 383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 384 {
 385         struct rt_cache_iter_state *st = seq->private;
 386         if (*pos)
 387                 return rt_cache_get_idx(seq, *pos - 1);
 388         st->genid = rt_genid(seq_file_net(seq));
 389         return SEQ_START_TOKEN;
 390 }
 391
 392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 393 {
 394         struct rtable *r;
 395
 396         if (v == SEQ_START_TOKEN)
 397                 r = rt_cache_get_first(seq);
 398         else
 399                 r = rt_cache_get_next(seq, v);
 400         ++*pos;
 401         return r;
 402 }
 403
 404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 405 {
 406         if (v && v != SEQ_START_TOKEN)
 407                 rcu_read_unlock_bh();
 408 }
 409
 410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 411 {
 412         if (v == SEQ_START_TOKEN)
 413                 seq_printf(seq, "%-127s\n",
 414                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 415                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 416                            "HHUptod\tSpecDst");
 417         else {
 418                 struct rtable *r = v;
 419                 struct neighbour *n;
 420                 int len;
 421
 422                 n = dst_get_neighbour(&r->dst);
 423                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 424                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 425                         r->dst.dev ? r->dst.dev->name : "*",
 426                         (__force u32)r->rt_dst,
 427                         (__force u32)r->rt_gateway,
 428                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 429                         r->dst.__use, 0, (__force u32)r->rt_src,
 430                         dst_metric_advmss(&r->dst) + 40,
 431                         dst_metric(&r->dst, RTAX_WINDOW),
 432                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 433                               dst_metric(&r->dst, RTAX_RTTVAR)),
 434                         r->rt_key_tos,
 435                         -1,
 436                         (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
 437                         r->rt_spec_dst, &len);
 438
 439                 seq_printf(seq, "%*s\n", 127 - len, "");
 440         }
 441         return 0;
 442 }
 443
 444 static const struct seq_operations rt_cache_seq_ops = {
 445         .start  = rt_cache_seq_start,
 446         .next   = rt_cache_seq_next,
 447         .stop   = rt_cache_seq_stop,
 448         .show   = rt_cache_seq_show,
 449 };
 450
 451 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 452 {
 453         return seq_open_net(inode, file, &rt_cache_seq_ops,
 454                         sizeof(struct rt_cache_iter_state));
 455 }
 456
 457 static const struct file_operations rt_cache_seq_fops = {
 458         .owner   = THIS_MODULE,
 459         .open    = rt_cache_seq_open,
 460         .read    = seq_read,
 461         .llseek  = seq_lseek,
 462         .release = seq_release_net,
 463 };
 464
 465
 466 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 467 {
 468         int cpu;
 469
 470         if (*pos == 0)
 471                 return SEQ_START_TOKEN;
 472
 473         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 474                 if (!cpu_possible(cpu))
 475                         continue;
 476                 *pos = cpu+1;
 477                 return &per_cpu(rt_cache_stat, cpu);
 478         }
 479         return NULL;
 480 }
 481
 482 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 483 {
 484         int cpu;
 485
 486         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 487                 if (!cpu_possible(cpu))
 488                         continue;
 489                 *pos = cpu+1;
 490                 return &per_cpu(rt_cache_stat, cpu);
 491         }
 492         return NULL;
 493
 494 }
 495
 496 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 497 {
 498
 499 }
 500
 501 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 502 {
 503         struct rt_cache_stat *st = v;
 504
 505         if (v == SEQ_START_TOKEN) {
 506                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 507                 return 0;
 508         }
 509
 510         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 511                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 512                    dst_entries_get_slow(&ipv4_dst_ops),
 513                    st->in_hit,
 514                    st->in_slow_tot,
 515                    st->in_slow_mc,
 516                    st->in_no_route,
 517                    st->in_brd,
 518                    st->in_martian_dst,
 519                    st->in_martian_src,
 520
 521                    st->out_hit,
 522                    st->out_slow_tot,
 523                    st->out_slow_mc,
 524
 525                    st->gc_total,
 526                    st->gc_ignored,
 527                    st->gc_goal_miss,
 528                    st->gc_dst_overflow,
 529                    st->in_hlist_search,
 530                    st->out_hlist_search
 531                 );
 532         return 0;
 533 }
 534
 535 static const struct seq_operations rt_cpu_seq_ops = {
 536         .start  = rt_cpu_seq_start,
 537         .next   = rt_cpu_seq_next,
 538         .stop   = rt_cpu_seq_stop,
 539         .show   = rt_cpu_seq_show,
 540 };
 541
 542
 543 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 544 {
 545         return seq_open(file, &rt_cpu_seq_ops);
 546 }
 547
 548 static const struct file_operations rt_cpu_seq_fops = {
 549         .owner   = THIS_MODULE,
 550         .open    = rt_cpu_seq_open,
 551         .read    = seq_read,
 552         .llseek  = seq_lseek,
 553         .release = seq_release,
 554 };
 555
 556 #ifdef CONFIG_IP_ROUTE_CLASSID
 557 static int rt_acct_proc_show(struct seq_file *m, void *v)
 558 {
 559         struct ip_rt_acct *dst, *src;
 560         unsigned int i, j;
 561
 562         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 563         if (!dst)
 564                 return -ENOMEM;
 565
 566         for_each_possible_cpu(i) {
 567                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 568                 for (j = 0; j < 256; j++) {
 569                         dst[j].o_bytes   += src[j].o_bytes;
 570                         dst[j].o_packets += src[j].o_packets;
 571                         dst[j].i_bytes   += src[j].i_bytes;
 572                         dst[j].i_packets += src[j].i_packets;
 573                 }
 574         }
 575
 576         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 577         kfree(dst);
 578         return 0;
 579 }
 580
 581 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 582 {
 583         return single_open(file, rt_acct_proc_show, NULL);
 584 }
 585
 586 static const struct file_operations rt_acct_proc_fops = {
 587         .owner          = THIS_MODULE,
 588         .open           = rt_acct_proc_open,
 589         .read           = seq_read,
 590         .llseek         = seq_lseek,
 591         .release        = single_release,
 592 };
 593 #endif
 594
 595 static int __net_init ip_rt_do_proc_init(struct net *net)
 596 {
 597         struct proc_dir_entry *pde;
 598
 599         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 600                         &rt_cache_seq_fops);
 601         if (!pde)
 602                 goto err1;
 603
 604         pde = proc_create("rt_cache", S_IRUGO,
 605                           net->proc_net_stat, &rt_cpu_seq_fops);
 606         if (!pde)
 607                 goto err2;
 608
 609 #ifdef CONFIG_IP_ROUTE_CLASSID
 610         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 611         if (!pde)
 612                 goto err3;
 613 #endif
 614         return 0;
 615
 616 #ifdef CONFIG_IP_ROUTE_CLASSID
 617 err3:
 618         remove_proc_entry("rt_cache", net->proc_net_stat);
 619 #endif
 620 err2:
 621         remove_proc_entry("rt_cache", net->proc_net);
 622 err1:
 623         return -ENOMEM;
 624 }
 625
 626 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 627 {
 628         remove_proc_entry("rt_cache", net->proc_net_stat);
 629         remove_proc_entry("rt_cache", net->proc_net);
 630 #ifdef CONFIG_IP_ROUTE_CLASSID
 631         remove_proc_entry("rt_acct", net->proc_net);
 632 #endif
 633 }
 634
 635 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 636         .init = ip_rt_do_proc_init,
 637         .exit = ip_rt_do_proc_exit,
 638 };
 639
 640 static int __init ip_rt_proc_init(void)
 641 {
 642         return register_pernet_subsys(&ip_rt_proc_ops);
 643 }
 644
 645 #else
 646 static inline int ip_rt_proc_init(void)
 647 {
 648         return 0;
 649 }
 650 #endif /* CONFIG_PROC_FS */
 651
 652 static inline void rt_free(struct rtable *rt)
 653 {
 654         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 655 }
 656
 657 static inline void rt_drop(struct rtable *rt)
 658 {
 659         ip_rt_put(rt);
 660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 661 }
 662
 663 static inline int rt_fast_clean(struct rtable *rth)
 664 {
 665         /* Kill broadcast/multicast entries very aggresively, if they
 666            collide in hash table with more useful entries */
 667         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 668                 rt_is_input_route(rth) && rth->dst.rt_next;
 669 }
 670
 671 static inline int rt_valuable(struct rtable *rth)
 672 {
 673         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 674                 (rth->peer && rth->peer->pmtu_expires);
 675 }
 676
 677 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 678 {
 679         unsigned long age;
 680         int ret = 0;
 681
 682         if (atomic_read(&rth->dst.__refcnt))
 683                 goto out;
 684
 685         age = jiffies - rth->dst.lastuse;
 686         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 687             (age <= tmo2 && rt_valuable(rth)))
 688                 goto out;
 689         ret = 1;
 690 out:    return ret;
 691 }
 692
 693 /* Bits of score are:
 694  * 31: very valuable
 695  * 30: not quite useless
 696  * 29..0: usage counter
 697  */
 698 static inline u32 rt_score(struct rtable *rt)
 699 {
 700         u32 score = jiffies - rt->dst.lastuse;
 701
 702         score = ~score & ~(3<<30);
 703
 704         if (rt_valuable(rt))
 705                 score |= (1<<31);
 706
 707         if (rt_is_output_route(rt) ||
 708             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 709                 score |= (1<<30);
 710
 711         return score;
 712 }
 713
 714 static inline bool rt_caching(const struct net *net)
 715 {
 716         return net->ipv4.current_rt_cache_rebuild_count <=
 717                 net->ipv4.sysctl_rt_cache_rebuild_count;
 718 }
 719
 720 static inline bool compare_hash_inputs(const struct rtable *rt1,
 721                                        const struct rtable *rt2)
 722 {
 723         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 724                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 725                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 726 }
 727
 728 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 729 {
 730         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 731                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 732                 (rt1->rt_mark ^ rt2->rt_mark) |
 733                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 734                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 735                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 736 }
 737
 738 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 739 {
 740         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 741 }
 742
 743 static inline int rt_is_expired(struct rtable *rth)
 744 {
 745         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 746 }
 747
 748 /*
 749  * Perform a full scan of hash table and free all entries.
 750  * Can be called by a softirq or a process.
 751  * In the later case, we want to be reschedule if necessary
 752  */
 753 static void rt_do_flush(struct net *net, int process_context)
 754 {
 755         unsigned int i;
 756         struct rtable *rth, *next;
 757
 758         for (i = 0; i <= rt_hash_mask; i++) {
 759                 struct rtable __rcu **pprev;
 760                 struct rtable *list;
 761
 762                 if (process_context && need_resched())
 763                         cond_resched();
 764                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
 765                 if (!rth)
 766                         continue;
 767
 768                 spin_lock_bh(rt_hash_lock_addr(i));
 769
 770                 list = NULL;
 771                 pprev = &rt_hash_table[i].chain;
 772                 rth = rcu_dereference_protected(*pprev,
 773                         lockdep_is_held(rt_hash_lock_addr(i)));
 774
 775                 while (rth) {
 776                         next = rcu_dereference_protected(rth->dst.rt_next,
 777                                 lockdep_is_held(rt_hash_lock_addr(i)));
 778
 779                         if (!net ||
 780                             net_eq(dev_net(rth->dst.dev), net)) {
 781                                 rcu_assign_pointer(*pprev, next);
 782                                 rcu_assign_pointer(rth->dst.rt_next, list);
 783                                 list = rth;
 784                         } else {
 785                                 pprev = &rth->dst.rt_next;
 786                         }
 787                         rth = next;
 788                 }
 789
 790                 spin_unlock_bh(rt_hash_lock_addr(i));
 791
 792                 for (; list; list = next) {
 793                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 794                         rt_free(list);
 795                 }
 796         }
 797 }
 798
 799 /*
 800  * While freeing expired entries, we compute average chain length
 801  * and standard deviation, using fixed-point arithmetic.
 802  * This to have an estimation of rt_chain_length_max
 803  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 804  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 805  */
 806
 807 #define FRACT_BITS 3
 808 #define ONE (1UL << FRACT_BITS)
 809
 810 /*
 811  * Given a hash chain and an item in this hash chain,
 812  * find if a previous entry has the same hash_inputs
 813  * (but differs on tos, mark or oif)
 814  * Returns 0 if an alias is found.
 815  * Returns ONE if rth has no alias before itself.
 816  */
 817 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 818 {
 819         const struct rtable *aux = head;
 820
 821         while (aux != rth) {
 822                 if (compare_hash_inputs(aux, rth))
 823                         return 0;
 824                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 825         }
 826         return ONE;
 827 }
 828
 829 /*
 830  * Perturbation of rt_genid by a small quantity [1..256]
 831  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 832  * many times (2^24) without giving recent rt_genid.
 833  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 834  */
 835 static void rt_cache_invalidate(struct net *net)
 836 {
 837         unsigned char shuffle;
 838
 839         get_random_bytes(&shuffle, sizeof(shuffle));
 840         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 841 }
 842
 843 /*
 844  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 845  * delay >= 0 : invalidate & flush cache (can be long)
 846  */
 847 void rt_cache_flush(struct net *net, int delay)
 848 {
 849         rt_cache_invalidate(net);
 850         if (delay >= 0)
 851                 rt_do_flush(net, !in_softirq());
 852 }
 853
 854 /* Flush previous cache invalidated entries from the cache */
 855 void rt_cache_flush_batch(struct net *net)
 856 {
 857         rt_do_flush(net, !in_softirq());
 858 }
 859
 860 static void rt_emergency_hash_rebuild(struct net *net)
 861 {
 862         if (net_ratelimit())
 863                 printk(KERN_WARNING "Route hash chain too long!\n");
 864         rt_cache_invalidate(net);
 865 }
 866
 867 /*
 868    Short description of GC goals.
 869
 870    We want to build algorithm, which will keep routing cache
 871    at some equilibrium point, when number of aged off entries
 872    is kept approximately equal to newly generated ones.
 873
 874    Current expiration strength is variable "expire".
 875    We try to adjust it dynamically, so that if networking
 876    is idle expires is large enough to keep enough of warm entries,
 877    and when load increases it reduces to limit cache size.
 878  */
 879
 880 static int rt_garbage_collect(struct dst_ops *ops)
 881 {
 882         static unsigned long expire = RT_GC_TIMEOUT;
 883         static unsigned long last_gc;
 884         static int rover;
 885         static int equilibrium;
 886         struct rtable *rth;
 887         struct rtable __rcu **rthp;
 888         unsigned long now = jiffies;
 889         int goal;
 890         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 891
 892         /*
 893          * Garbage collection is pretty expensive,
 894          * do not make it too frequently.
 895          */
 896
 897         RT_CACHE_STAT_INC(gc_total);
 898
 899         if (now - last_gc < ip_rt_gc_min_interval &&
 900             entries < ip_rt_max_size) {
 901                 RT_CACHE_STAT_INC(gc_ignored);
 902                 goto out;
 903         }
 904
 905         entries = dst_entries_get_slow(&ipv4_dst_ops);
 906         /* Calculate number of entries, which we want to expire now. */
 907         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 908         if (goal <= 0) {
 909                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 910                         equilibrium = ipv4_dst_ops.gc_thresh;
 911                 goal = entries - equilibrium;
 912                 if (goal > 0) {
 913                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 914                         goal = entries - equilibrium;
 915                 }
 916         } else {
 917                 /* We are in dangerous area. Try to reduce cache really
 918                  * aggressively.
 919                  */
 920                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 921                 equilibrium = entries - goal;
 922         }
 923
 924         if (now - last_gc >= ip_rt_gc_min_interval)
 925                 last_gc = now;
 926
 927         if (goal <= 0) {
 928                 equilibrium += goal;
 929                 goto work_done;
 930         }
 931
 932         do {
 933                 int i, k;
 934
 935                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 936                         unsigned long tmo = expire;
 937
 938                         k = (k + 1) & rt_hash_mask;
 939                         rthp = &rt_hash_table[k].chain;
 940                         spin_lock_bh(rt_hash_lock_addr(k));
 941                         while ((rth = rcu_dereference_protected(*rthp,
 942                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 943                                 if (!rt_is_expired(rth) &&
 944                                         !rt_may_expire(rth, tmo, expire)) {
 945                                         tmo >>= 1;
 946                                         rthp = &rth->dst.rt_next;
 947                                         continue;
 948                                 }
 949                                 *rthp = rth->dst.rt_next;
 950                                 rt_free(rth);
 951                                 goal--;
 952                         }
 953                         spin_unlock_bh(rt_hash_lock_addr(k));
 954                         if (goal <= 0)
 955                                 break;
 956                 }
 957                 rover = k;
 958
 959                 if (goal <= 0)
 960                         goto work_done;
 961
 962                 /* Goal is not achieved. We stop process if:
 963
 964                    - if expire reduced to zero. Otherwise, expire is halfed.
 965                    - if table is not full.
 966                    - if we are called from interrupt.
 967                    - jiffies check is just fallback/debug loop breaker.
 968                      We will not spin here for long time in any case.
 969                  */
 970
 971                 RT_CACHE_STAT_INC(gc_goal_miss);
 972
 973                 if (expire == 0)
 974                         break;
 975
 976                 expire >>= 1;
 977
 978                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 979                         goto out;
 980         } while (!in_softirq() && time_before_eq(jiffies, now));
 981
 982         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 983                 goto out;
 984         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 985                 goto out;
 986         if (net_ratelimit())
 987                 printk(KERN_WARNING "dst cache overflow\n");
 988         RT_CACHE_STAT_INC(gc_dst_overflow);
 989         return 1;
 990
 991 work_done:
 992         expire += ip_rt_gc_min_interval;
 993         if (expire > ip_rt_gc_timeout ||
 994             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
 995             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 996                 expire = ip_rt_gc_timeout;
 997 out:    return 0;
 998 }
 999
1000 /*
1001  * Returns number of entries in a hash chain that have different hash_inputs
1002  */
1003 static int slow_chain_length(const struct rtable *head)
1004 {
1005         int length = 0;
1006         const struct rtable *rth = head;
1007
1008         while (rth) {
1009                 length += has_noalias(head, rth);
1010                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1011         }
1012         return length >> FRACT_BITS;
1013 }
1014
1015 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1016 {
1017         struct neigh_table *tbl = &arp_tbl;
1018         static const __be32 inaddr_any = 0;
1019         struct net_device *dev = dst->dev;
1020         const __be32 *pkey = daddr;
1021         struct neighbour *n;
1022
1023 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1024         if (dev->type == ARPHRD_ATM)
1025                 tbl = clip_tbl_hook;
1026 #endif
1027         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1028                 pkey = &inaddr_any;
1029
1030         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1031         if (n)
1032                 return n;
1033         return neigh_create(tbl, pkey, dev);
1034 }
1035
1036 static int rt_bind_neighbour(struct rtable *rt)
1037 {
1038         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1039         if (IS_ERR(n))
1040                 return PTR_ERR(n);
1041         dst_set_neighbour(&rt->dst, n);
1042
1043         return 0;
1044 }
1045
1046 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1047                                      struct sk_buff *skb, int ifindex)
1048 {
1049         struct rtable   *rth, *cand;
1050         struct rtable __rcu **rthp, **candp;
1051         unsigned long   now;
1052         u32             min_score;
1053         int             chain_length;
1054         int attempts = !in_softirq();
1055
1056 restart:
1057         chain_length = 0;
1058         min_score = ~(u32)0;
1059         cand = NULL;
1060         candp = NULL;
1061         now = jiffies;
1062
1063         if (!rt_caching(dev_net(rt->dst.dev))) {
1064                 /*
1065                  * If we're not caching, just tell the caller we
1066                  * were successful and don't touch the route.  The
1067                  * caller hold the sole reference to the cache entry, and
1068                  * it will be released when the caller is done with it.
1069                  * If we drop it here, the callers have no way to resolve routes
1070                  * when we're not caching.  Instead, just point *rp at rt, so
1071                  * the caller gets a single use out of the route
1072                  * Note that we do rt_free on this new route entry, so that
1073                  * once its refcount hits zero, we are still able to reap it
1074                  * (Thanks Alexey)
1075                  * Note: To avoid expensive rcu stuff for this uncached dst,
1076                  * we set DST_NOCACHE so that dst_release() can free dst without
1077                  * waiting a grace period.
1078                  */
1079
1080                 rt->dst.flags |= DST_NOCACHE;
1081                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1082                         int err = rt_bind_neighbour(rt);
1083                         if (err) {
1084                                 if (net_ratelimit())
1085                                         printk(KERN_WARNING
1086                                             "Neighbour table failure & not caching routes.\n");
1087                                 ip_rt_put(rt);
1088                                 return ERR_PTR(err);
1089                         }
1090                 }
1091
1092                 goto skip_hashing;
1093         }
1094
1095         rthp = &rt_hash_table[hash].chain;
1096
1097         spin_lock_bh(rt_hash_lock_addr(hash));
1098         while ((rth = rcu_dereference_protected(*rthp,
1099                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1100                 if (rt_is_expired(rth)) {
1101                         *rthp = rth->dst.rt_next;
1102                         rt_free(rth);
1103                         continue;
1104                 }
1105                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1106                         /* Put it first */
1107                         *rthp = rth->dst.rt_next;
1108                         /*
1109                          * Since lookup is lockfree, the deletion
1110                          * must be visible to another weakly ordered CPU before
1111                          * the insertion at the start of the hash chain.
1112                          */
1113                         rcu_assign_pointer(rth->dst.rt_next,
1114                                            rt_hash_table[hash].chain);
1115                         /*
1116                          * Since lookup is lockfree, the update writes
1117                          * must be ordered for consistency on SMP.
1118                          */
1119                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1120
1121                         dst_use(&rth->dst, now);
1122                         spin_unlock_bh(rt_hash_lock_addr(hash));
1123
1124                         rt_drop(rt);
1125                         if (skb)
1126                                 skb_dst_set(skb, &rth->dst);
1127                         return rth;
1128                 }
1129
1130                 if (!atomic_read(&rth->dst.__refcnt)) {
1131                         u32 score = rt_score(rth);
1132
1133                         if (score <= min_score) {
1134                                 cand = rth;
1135                                 candp = rthp;
1136                                 min_score = score;
1137                         }
1138                 }
1139
1140                 chain_length++;
1141
1142                 rthp = &rth->dst.rt_next;
1143         }
1144
1145         if (cand) {
1146                 /* ip_rt_gc_elasticity used to be average length of chain
1147                  * length, when exceeded gc becomes really aggressive.
1148                  *
1149                  * The second limit is less certain. At the moment it allows
1150                  * only 2 entries per bucket. We will see.
1151                  */
1152                 if (chain_length > ip_rt_gc_elasticity) {
1153                         *candp = cand->dst.rt_next;
1154                         rt_free(cand);
1155                 }
1156         } else {
1157                 if (chain_length > rt_chain_length_max &&
1158                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1159                         struct net *net = dev_net(rt->dst.dev);
1160                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1161                         if (!rt_caching(net)) {
1162                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1163                                         rt->dst.dev->name, num);
1164                         }
1165                         rt_emergency_hash_rebuild(net);
1166                         spin_unlock_bh(rt_hash_lock_addr(hash));
1167
1168                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1169                                         ifindex, rt_genid(net));
1170                         goto restart;
1171                 }
1172         }
1173
1174         /* Try to bind route to arp only if it is output
1175            route or unicast forwarding path.
1176          */
1177         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                 int err = rt_bind_neighbour(rt);
1179                 if (err) {
1180                         spin_unlock_bh(rt_hash_lock_addr(hash));
1181
1182                         if (err != -ENOBUFS) {
1183                                 rt_drop(rt);
1184                                 return ERR_PTR(err);
1185                         }
1186
1187                         /* Neighbour tables are full and nothing
1188                            can be released. Try to shrink route cache,
1189                            it is most likely it holds some neighbour records.
1190                          */
1191                         if (attempts-- > 0) {
1192                                 int saved_elasticity = ip_rt_gc_elasticity;
1193                                 int saved_int = ip_rt_gc_min_interval;
1194                                 ip_rt_gc_elasticity     = 1;
1195                                 ip_rt_gc_min_interval   = 0;
1196                                 rt_garbage_collect(&ipv4_dst_ops);
1197                                 ip_rt_gc_min_interval   = saved_int;
1198                                 ip_rt_gc_elasticity     = saved_elasticity;
1199                                 goto restart;
1200                         }
1201
1202                         if (net_ratelimit())
1203                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1204                         rt_drop(rt);
1205                         return ERR_PTR(-ENOBUFS);
1206                 }
1207         }
1208
1209         rt->dst.rt_next = rt_hash_table[hash].chain;
1210
1211         /*
1212          * Since lookup is lockfree, we must make sure
1213          * previous writes to rt are committed to memory
1214          * before making rt visible to other CPUS.
1215          */
1216         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1217
1218         spin_unlock_bh(rt_hash_lock_addr(hash));
1219
1220 skip_hashing:
1221         if (skb)
1222                 skb_dst_set(skb, &rt->dst);
1223         return rt;
1224 }
1225
1226 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1227
1228 static u32 rt_peer_genid(void)
1229 {
1230         return atomic_read(&__rt_peer_genid);
1231 }
1232
1233 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1234 {
1235         struct inet_peer *peer;
1236
1237         peer = inet_getpeer_v4(daddr, create);
1238
1239         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1240                 inet_putpeer(peer);
1241         else
1242                 rt->rt_peer_genid = rt_peer_genid();
1243 }
1244
1245 /*
1246  * Peer allocation may fail only in serious out-of-memory conditions.  However
1247  * we still can generate some output.
1248  * Random ID selection looks a bit dangerous because we have no chances to
1249  * select ID being unique in a reasonable period of time.
1250  * But broken packet identifier may be better than no packet at all.
1251  */
1252 static void ip_select_fb_ident(struct iphdr *iph)
1253 {
1254         static DEFINE_SPINLOCK(ip_fb_id_lock);
1255         static u32 ip_fallback_id;
1256         u32 salt;
1257
1258         spin_lock_bh(&ip_fb_id_lock);
1259         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1260         iph->id = htons(salt & 0xFFFF);
1261         ip_fallback_id = salt;
1262         spin_unlock_bh(&ip_fb_id_lock);
1263 }
1264
1265 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1266 {
1267         struct rtable *rt = (struct rtable *) dst;
1268
1269         if (rt) {
1270                 if (rt->peer == NULL)
1271                         rt_bind_peer(rt, rt->rt_dst, 1);
1272
1273                 /* If peer is attached to destination, it is never detached,
1274                    so that we need not to grab a lock to dereference it.
1275                  */
1276                 if (rt->peer) {
1277                         iph->id = htons(inet_getid(rt->peer, more));
1278                         return;
1279                 }
1280         } else
1281                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1282                        __builtin_return_address(0));
1283
1284         ip_select_fb_ident(iph);
1285 }
1286 EXPORT_SYMBOL(__ip_select_ident);
1287
1288 static void rt_del(unsigned hash, struct rtable *rt)
1289 {
1290         struct rtable __rcu **rthp;
1291         struct rtable *aux;
1292
1293         rthp = &rt_hash_table[hash].chain;
1294         spin_lock_bh(rt_hash_lock_addr(hash));
1295         ip_rt_put(rt);
1296         while ((aux = rcu_dereference_protected(*rthp,
1297                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1298                 if (aux == rt || rt_is_expired(aux)) {
1299                         *rthp = aux->dst.rt_next;
1300                         rt_free(aux);
1301                         continue;
1302                 }
1303                 rthp = &aux->dst.rt_next;
1304         }
1305         spin_unlock_bh(rt_hash_lock_addr(hash));
1306 }
1307
1308 /* called in rcu_read_lock() section */
1309 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1310                     __be32 saddr, struct net_device *dev)
1311 {
1312         int s, i;
1313         struct in_device *in_dev = __in_dev_get_rcu(dev);
1314         struct rtable *rt;
1315         __be32 skeys[2] = { saddr, 0 };
1316         int    ikeys[2] = { dev->ifindex, 0 };
1317         struct flowi4 fl4;
1318         struct inet_peer *peer;
1319         struct net *net;
1320
1321         if (!in_dev)
1322                 return;
1323
1324         net = dev_net(dev);
1325         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1326             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1327             ipv4_is_zeronet(new_gw))
1328                 goto reject_redirect;
1329
1330         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1331                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1332                         goto reject_redirect;
1333                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1334                         goto reject_redirect;
1335         } else {
1336                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1337                         goto reject_redirect;
1338         }
1339
1340         memset(&fl4, 0, sizeof(fl4));
1341         fl4.daddr = daddr;
1342         for (s = 0; s < 2; s++) {
1343                 for (i = 0; i < 2; i++) {
1344                         fl4.flowi4_oif = ikeys[i];
1345                         fl4.saddr = skeys[s];
1346                         rt = __ip_route_output_key(net, &fl4);
1347                         if (IS_ERR(rt))
1348                                 continue;
1349
1350                         if (rt->dst.error || rt->dst.dev != dev ||
1351                             rt->rt_gateway != old_gw) {
1352                                 ip_rt_put(rt);
1353                                 continue;
1354                         }
1355
1356                         if (!rt->peer)
1357                                 rt_bind_peer(rt, rt->rt_dst, 1);
1358
1359                         peer = rt->peer;
1360                         if (peer) {
1361                                 peer->redirect_learned.a4 = new_gw;
1362                                 atomic_inc(&__rt_peer_genid);
1363                         }
1364
1365                         ip_rt_put(rt);
1366                         return;
1367                 }
1368         }
1369         return;
1370
1371 reject_redirect:
1372 #ifdef CONFIG_IP_ROUTE_VERBOSE
1373         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1374                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1375                         "  Advised path = %pI4 -> %pI4\n",
1376                        &old_gw, dev->name, &new_gw,
1377                        &saddr, &daddr);
1378 #endif
1379         ;
1380 }
1381
1382 static bool peer_pmtu_expired(struct inet_peer *peer)
1383 {
1384         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1385
1386         return orig &&
1387                time_after_eq(jiffies, orig) &&
1388                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1389 }
1390
1391 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1392 {
1393         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1394
1395         return orig &&
1396                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1397 }
1398
1399 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1400 {
1401         struct rtable *rt = (struct rtable *)dst;
1402         struct dst_entry *ret = dst;
1403
1404         if (rt) {
1405                 if (dst->obsolete > 0) {
1406                         ip_rt_put(rt);
1407                         ret = NULL;
1408                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1409                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1410                                                 rt->rt_oif,
1411                                                 rt_genid(dev_net(dst->dev)));
1412                         rt_del(hash, rt);
1413                         ret = NULL;
1414                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1415                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1416                 }
1417         }
1418         return ret;
1419 }
1420
1421 /*
1422  * Algorithm:
1423  *      1. The first ip_rt_redirect_number redirects are sent
1424  *         with exponential backoff, then we stop sending them at all,
1425  *         assuming that the host ignores our redirects.
1426  *      2. If we did not see packets requiring redirects
1427  *         during ip_rt_redirect_silence, we assume that the host
1428  *         forgot redirected route and start to send redirects again.
1429  *
1430  * This algorithm is much cheaper and more intelligent than dumb load limiting
1431  * in icmp.c.
1432  *
1433  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1434  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1435  */
1436
1437 void ip_rt_send_redirect(struct sk_buff *skb)
1438 {
1439         struct rtable *rt = skb_rtable(skb);
1440         struct in_device *in_dev;
1441         struct inet_peer *peer;
1442         int log_martians;
1443
1444         rcu_read_lock();
1445         in_dev = __in_dev_get_rcu(rt->dst.dev);
1446         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1447                 rcu_read_unlock();
1448                 return;
1449         }
1450         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1451         rcu_read_unlock();
1452
1453         if (!rt->peer)
1454                 rt_bind_peer(rt, rt->rt_dst, 1);
1455         peer = rt->peer;
1456         if (!peer) {
1457                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1458                 return;
1459         }
1460
1461         /* No redirected packets during ip_rt_redirect_silence;
1462          * reset the algorithm.
1463          */
1464         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1465                 peer->rate_tokens = 0;
1466
1467         /* Too many ignored redirects; do not send anything
1468          * set dst.rate_last to the last seen redirected packet.
1469          */
1470         if (peer->rate_tokens >= ip_rt_redirect_number) {
1471                 peer->rate_last = jiffies;
1472                 return;
1473         }
1474
1475         /* Check for load limit; set rate_last to the latest sent
1476          * redirect.
1477          */
1478         if (peer->rate_tokens == 0 ||
1479             time_after(jiffies,
1480                        (peer->rate_last +
1481                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1482                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1483                 peer->rate_last = jiffies;
1484                 ++peer->rate_tokens;
1485 #ifdef CONFIG_IP_ROUTE_VERBOSE
1486                 if (log_martians &&
1487                     peer->rate_tokens == ip_rt_redirect_number &&
1488                     net_ratelimit())
1489                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1490                                &ip_hdr(skb)->saddr, rt->rt_iif,
1491                                 &rt->rt_dst, &rt->rt_gateway);
1492 #endif
1493         }
1494 }
1495
1496 static int ip_error(struct sk_buff *skb)
1497 {
1498         struct rtable *rt = skb_rtable(skb);
1499         struct inet_peer *peer;
1500         unsigned long now;
1501         bool send;
1502         int code;
1503
1504         switch (rt->dst.error) {
1505         case EINVAL:
1506         default:
1507                 goto out;
1508         case EHOSTUNREACH:
1509                 code = ICMP_HOST_UNREACH;
1510                 break;
1511         case ENETUNREACH:
1512                 code = ICMP_NET_UNREACH;
1513                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1514                                 IPSTATS_MIB_INNOROUTES);
1515                 break;
1516         case EACCES:
1517                 code = ICMP_PKT_FILTERED;
1518                 break;
1519         }
1520
1521         if (!rt->peer)
1522                 rt_bind_peer(rt, rt->rt_dst, 1);
1523         peer = rt->peer;
1524
1525         send = true;
1526         if (peer) {
1527                 now = jiffies;
1528                 peer->rate_tokens += now - peer->rate_last;
1529                 if (peer->rate_tokens > ip_rt_error_burst)
1530                         peer->rate_tokens = ip_rt_error_burst;
1531                 peer->rate_last = now;
1532                 if (peer->rate_tokens >= ip_rt_error_cost)
1533                         peer->rate_tokens -= ip_rt_error_cost;
1534                 else
1535                         send = false;
1536         }
1537         if (send)
1538                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1539
1540 out:    kfree_skb(skb);
1541         return 0;
1542 }
1543
1544 /*
1545  *      The last two values are not from the RFC but
1546  *      are needed for AMPRnet AX.25 paths.
1547  */
1548
1549 static const unsigned short mtu_plateau[] =
1550 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1551
1552 static inline unsigned short guess_mtu(unsigned short old_mtu)
1553 {
1554         int i;
1555
1556         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1557                 if (old_mtu > mtu_plateau[i])
1558                         return mtu_plateau[i];
1559         return 68;
1560 }
1561
1562 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1563                                  unsigned short new_mtu,
1564                                  struct net_device *dev)
1565 {
1566         unsigned short old_mtu = ntohs(iph->tot_len);
1567         unsigned short est_mtu = 0;
1568         struct inet_peer *peer;
1569
1570         peer = inet_getpeer_v4(iph->daddr, 1);
1571         if (peer) {
1572                 unsigned short mtu = new_mtu;
1573
1574                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1575                         /* BSD 4.2 derived systems incorrectly adjust
1576                          * tot_len by the IP header length, and report
1577                          * a zero MTU in the ICMP message.
1578                          */
1579                         if (mtu == 0 &&
1580                             old_mtu >= 68 + (iph->ihl << 2))
1581                                 old_mtu -= iph->ihl << 2;
1582                         mtu = guess_mtu(old_mtu);
1583                 }
1584
1585                 if (mtu < ip_rt_min_pmtu)
1586                         mtu = ip_rt_min_pmtu;
1587                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1588                         unsigned long pmtu_expires;
1589
1590                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1591                         if (!pmtu_expires)
1592                                 pmtu_expires = 1UL;
1593
1594                         est_mtu = mtu;
1595                         peer->pmtu_learned = mtu;
1596                         peer->pmtu_expires = pmtu_expires;
1597                 }
1598
1599                 inet_putpeer(peer);
1600
1601                 atomic_inc(&__rt_peer_genid);
1602         }
1603         return est_mtu ? : new_mtu;
1604 }
1605
1606 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1607 {
1608         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1609
1610         if (!expires)
1611                 return;
1612         if (time_before(jiffies, expires)) {
1613                 u32 orig_dst_mtu = dst_mtu(dst);
1614                 if (peer->pmtu_learned < orig_dst_mtu) {
1615                         if (!peer->pmtu_orig)
1616                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1617                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1618                 }
1619         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1620                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1621 }
1622
1623 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1624 {
1625         struct rtable *rt = (struct rtable *) dst;
1626         struct inet_peer *peer;
1627
1628         dst_confirm(dst);
1629
1630         if (!rt->peer)
1631                 rt_bind_peer(rt, rt->rt_dst, 1);
1632         peer = rt->peer;
1633         if (peer) {
1634                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1635
1636                 if (mtu < ip_rt_min_pmtu)
1637                         mtu = ip_rt_min_pmtu;
1638                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1639
1640                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1641                         if (!pmtu_expires)
1642                                 pmtu_expires = 1UL;
1643
1644                         peer->pmtu_learned = mtu;
1645                         peer->pmtu_expires = pmtu_expires;
1646
1647                         atomic_inc(&__rt_peer_genid);
1648                         rt->rt_peer_genid = rt_peer_genid();
1649                 }
1650                 check_peer_pmtu(dst, peer);
1651         }
1652 }
1653
1654 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1655 {
1656         struct rtable *rt = (struct rtable *) dst;
1657         __be32 orig_gw = rt->rt_gateway;
1658         struct neighbour *n, *old_n;
1659
1660         dst_confirm(&rt->dst);
1661
1662         rt->rt_gateway = peer->redirect_learned.a4;
1663
1664         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1665         if (IS_ERR(n))
1666                 return PTR_ERR(n);
1667         old_n = xchg(&rt->dst._neighbour, n);
1668         if (old_n)
1669                 neigh_release(old_n);
1670         if (!n || !(n->nud_state & NUD_VALID)) {
1671                 if (n)
1672                         neigh_event_send(n, NULL);
1673                 rt->rt_gateway = orig_gw;
1674                 return -EAGAIN;
1675         } else {
1676                 rt->rt_flags |= RTCF_REDIRECTED;
1677                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1678         }
1679         return 0;
1680 }
1681
1682 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1683 {
1684         struct rtable *rt = (struct rtable *) dst;
1685
1686         if (rt_is_expired(rt))
1687                 return NULL;
1688         if (rt->rt_peer_genid != rt_peer_genid()) {
1689                 struct inet_peer *peer;
1690
1691                 if (!rt->peer)
1692                         rt_bind_peer(rt, rt->rt_dst, 0);
1693
1694                 peer = rt->peer;
1695                 if (peer) {
1696                         check_peer_pmtu(dst, peer);
1697
1698                         if (peer->redirect_learned.a4 &&
1699                             peer->redirect_learned.a4 != rt->rt_gateway) {
1700                                 if (check_peer_redir(dst, peer))
1701                                         return NULL;
1702                         }
1703                 }
1704
1705                 rt->rt_peer_genid = rt_peer_genid();
1706         }
1707         return dst;
1708 }
1709
1710 static void ipv4_dst_destroy(struct dst_entry *dst)
1711 {
1712         struct rtable *rt = (struct rtable *) dst;
1713         struct inet_peer *peer = rt->peer;
1714
1715         if (rt->fi) {
1716                 fib_info_put(rt->fi);
1717                 rt->fi = NULL;
1718         }
1719         if (peer) {
1720                 rt->peer = NULL;
1721                 inet_putpeer(peer);
1722         }
1723 }
1724
1725
1726 static void ipv4_link_failure(struct sk_buff *skb)
1727 {
1728         struct rtable *rt;
1729
1730         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1731
1732         rt = skb_rtable(skb);
1733         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1734                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1735 }
1736
1737 static int ip_rt_bug(struct sk_buff *skb)
1738 {
1739         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1740                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1741                 skb->dev ? skb->dev->name : "?");
1742         kfree_skb(skb);
1743         WARN_ON(1);
1744         return 0;
1745 }
1746
1747 /*
1748    We do not cache source address of outgoing interface,
1749    because it is used only by IP RR, TS and SRR options,
1750    so that it out of fast path.
1751
1752    BTW remember: "addr" is allowed to be not aligned
1753    in IP options!
1754  */
1755
1756 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1757 {
1758         __be32 src;
1759
1760         if (rt_is_output_route(rt))
1761                 src = ip_hdr(skb)->saddr;
1762         else {
1763                 struct fib_result res;
1764                 struct flowi4 fl4;
1765                 struct iphdr *iph;
1766
1767                 iph = ip_hdr(skb);
1768
1769                 memset(&fl4, 0, sizeof(fl4));
1770                 fl4.daddr = iph->daddr;
1771                 fl4.saddr = iph->saddr;
1772                 fl4.flowi4_tos = RT_TOS(iph->tos);
1773                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1774                 fl4.flowi4_iif = skb->dev->ifindex;
1775                 fl4.flowi4_mark = skb->mark;
1776
1777                 rcu_read_lock();
1778                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1779                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1780                 else
1781                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1782                                         RT_SCOPE_UNIVERSE);
1783                 rcu_read_unlock();
1784         }
1785         memcpy(addr, &src, 4);
1786 }
1787
1788 #ifdef CONFIG_IP_ROUTE_CLASSID
1789 static void set_class_tag(struct rtable *rt, u32 tag)
1790 {
1791         if (!(rt->dst.tclassid & 0xFFFF))
1792                 rt->dst.tclassid |= tag & 0xFFFF;
1793         if (!(rt->dst.tclassid & 0xFFFF0000))
1794                 rt->dst.tclassid |= tag & 0xFFFF0000;
1795 }
1796 #endif
1797
1798 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1799 {
1800         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1801
1802         if (advmss == 0) {
1803                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1804                                ip_rt_min_advmss);
1805                 if (advmss > 65535 - 40)
1806                         advmss = 65535 - 40;
1807         }
1808         return advmss;
1809 }
1810
1811 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1812 {
1813         unsigned int mtu = dst->dev->mtu;
1814
1815         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1816                 const struct rtable *rt = (const struct rtable *) dst;
1817
1818                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1819                         mtu = 576;
1820         }
1821
1822         if (mtu > IP_MAX_MTU)
1823                 mtu = IP_MAX_MTU;
1824
1825         return mtu;
1826 }
1827
1828 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1829                             struct fib_info *fi)
1830 {
1831         struct inet_peer *peer;
1832         int create = 0;
1833
1834         /* If a peer entry exists for this destination, we must hook
1835          * it up in order to get at cached metrics.
1836          */
1837         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1838                 create = 1;
1839
1840         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1841         if (peer) {
1842                 rt->rt_peer_genid = rt_peer_genid();
1843                 if (inet_metrics_new(peer))
1844                         memcpy(peer->metrics, fi->fib_metrics,
1845                                sizeof(u32) * RTAX_MAX);
1846                 dst_init_metrics(&rt->dst, peer->metrics, false);
1847
1848                 check_peer_pmtu(&rt->dst, peer);
1849                 if (peer->redirect_learned.a4 &&
1850                     peer->redirect_learned.a4 != rt->rt_gateway) {
1851                         rt->rt_gateway = peer->redirect_learned.a4;
1852                         rt->rt_flags |= RTCF_REDIRECTED;
1853                 }
1854         } else {
1855                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1856                         rt->fi = fi;
1857                         atomic_inc(&fi->fib_clntref);
1858                 }
1859                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1860         }
1861 }
1862
1863 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1864                            const struct fib_result *res,
1865                            struct fib_info *fi, u16 type, u32 itag)
1866 {
1867         struct dst_entry *dst = &rt->dst;
1868
1869         if (fi) {
1870                 if (FIB_RES_GW(*res) &&
1871                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1872                         rt->rt_gateway = FIB_RES_GW(*res);
1873                 rt_init_metrics(rt, fl4, fi);
1874 #ifdef CONFIG_IP_ROUTE_CLASSID
1875                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1876 #endif
1877         }
1878
1879         if (dst_mtu(dst) > IP_MAX_MTU)
1880                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1881         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1882                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1883
1884 #ifdef CONFIG_IP_ROUTE_CLASSID
1885 #ifdef CONFIG_IP_MULTIPLE_TABLES
1886         set_class_tag(rt, fib_rules_tclass(res));
1887 #endif
1888         set_class_tag(rt, itag);
1889 #endif
1890 }
1891
1892 static struct rtable *rt_dst_alloc(struct net_device *dev,
1893                                    bool nopolicy, bool noxfrm)
1894 {
1895         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1896                          DST_HOST |
1897                          (nopolicy ? DST_NOPOLICY : 0) |
1898                          (noxfrm ? DST_NOXFRM : 0));
1899 }
1900
1901 /* called in rcu_read_lock() section */
1902 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1903                                 u8 tos, struct net_device *dev, int our)
1904 {
1905         unsigned int hash;
1906         struct rtable *rth;
1907         __be32 spec_dst;
1908         struct in_device *in_dev = __in_dev_get_rcu(dev);
1909         u32 itag = 0;
1910         int err;
1911
1912         /* Primary sanity checks. */
1913
1914         if (in_dev == NULL)
1915                 return -EINVAL;
1916
1917         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1918             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1919                 goto e_inval;
1920
1921         if (ipv4_is_zeronet(saddr)) {
1922                 if (!ipv4_is_local_multicast(daddr))
1923                         goto e_inval;
1924                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1925         } else {
1926                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1927                                           &itag);
1928                 if (err < 0)
1929                         goto e_err;
1930         }
1931         rth = rt_dst_alloc(init_net.loopback_dev,
1932                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1933         if (!rth)
1934                 goto e_nobufs;
1935
1936 #ifdef CONFIG_IP_ROUTE_CLASSID
1937         rth->dst.tclassid = itag;
1938 #endif
1939         rth->dst.output = ip_rt_bug;
1940
1941         rth->rt_key_dst = daddr;
1942         rth->rt_key_src = saddr;
1943         rth->rt_genid   = rt_genid(dev_net(dev));
1944         rth->rt_flags   = RTCF_MULTICAST;
1945         rth->rt_type    = RTN_MULTICAST;
1946         rth->rt_key_tos = tos;
1947         rth->rt_dst     = daddr;
1948         rth->rt_src     = saddr;
1949         rth->rt_route_iif = dev->ifindex;
1950         rth->rt_iif     = dev->ifindex;
1951         rth->rt_oif     = 0;
1952         rth->rt_mark    = skb->mark;
1953         rth->rt_gateway = daddr;
1954         rth->rt_spec_dst= spec_dst;
1955         rth->rt_peer_genid = 0;
1956         rth->peer = NULL;
1957         rth->fi = NULL;
1958         if (our) {
1959                 rth->dst.input= ip_local_deliver;
1960                 rth->rt_flags |= RTCF_LOCAL;
1961         }
1962
1963 #ifdef CONFIG_IP_MROUTE
1964         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1965                 rth->dst.input = ip_mr_input;
1966 #endif
1967         RT_CACHE_STAT_INC(in_slow_mc);
1968
1969         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1970         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1971         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1972
1973 e_nobufs:
1974         return -ENOBUFS;
1975 e_inval:
1976         return -EINVAL;
1977 e_err:
1978         return err;
1979 }
1980
1981
1982 static void ip_handle_martian_source(struct net_device *dev,
1983                                      struct in_device *in_dev,
1984                                      struct sk_buff *skb,
1985                                      __be32 daddr,
1986                                      __be32 saddr)
1987 {
1988         RT_CACHE_STAT_INC(in_martian_src);
1989 #ifdef CONFIG_IP_ROUTE_VERBOSE
1990         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1991                 /*
1992                  *      RFC1812 recommendation, if source is martian,
1993                  *      the only hint is MAC header.
1994                  */
1995                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1996                         &daddr, &saddr, dev->name);
1997                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1998                         int i;
1999                         const unsigned char *p = skb_mac_header(skb);
2000                         printk(KERN_WARNING "ll header: ");
2001                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2002                                 printk("%02x", *p);
2003                                 if (i < (dev->hard_header_len - 1))
2004                                         printk(":");
2005                         }
2006                         printk("\n");
2007                 }
2008         }
2009 #endif
2010 }
2011
2012 /* called in rcu_read_lock() section */
2013 static int __mkroute_input(struct sk_buff *skb,
2014                            const struct fib_result *res,
2015                            struct in_device *in_dev,
2016                            __be32 daddr, __be32 saddr, u32 tos,
2017                            struct rtable **result)
2018 {
2019         struct rtable *rth;
2020         int err;
2021         struct in_device *out_dev;
2022         unsigned int flags = 0;
2023         __be32 spec_dst;
2024         u32 itag;
2025
2026         /* get a working reference to the output device */
2027         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2028         if (out_dev == NULL) {
2029                 if (net_ratelimit())
2030                         printk(KERN_CRIT "Bug in ip_route_input" \
2031                                "_slow(). Please, report\n");
2032                 return -EINVAL;
2033         }
2034
2035
2036         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2037                                   in_dev->dev, &spec_dst, &itag);
2038         if (err < 0) {
2039                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2040                                          saddr);
2041
2042                 goto cleanup;
2043         }
2044
2045         if (err)
2046                 flags |= RTCF_DIRECTSRC;
2047
2048         if (out_dev == in_dev && err &&
2049             (IN_DEV_SHARED_MEDIA(out_dev) ||
2050              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2051                 flags |= RTCF_DOREDIRECT;
2052
2053         if (skb->protocol != htons(ETH_P_IP)) {
2054                 /* Not IP (i.e. ARP). Do not create route, if it is
2055                  * invalid for proxy arp. DNAT routes are always valid.
2056                  *
2057                  * Proxy arp feature have been extended to allow, ARP
2058                  * replies back to the same interface, to support
2059                  * Private VLAN switch technologies. See arp.c.
2060                  */
2061                 if (out_dev == in_dev &&
2062                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2063                         err = -EINVAL;
2064                         goto cleanup;
2065                 }
2066         }
2067
2068         rth = rt_dst_alloc(out_dev->dev,
2069                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2070                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2071         if (!rth) {
2072                 err = -ENOBUFS;
2073                 goto cleanup;
2074         }
2075
2076         rth->rt_key_dst = daddr;
2077         rth->rt_key_src = saddr;
2078         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2079         rth->rt_flags = flags;
2080         rth->rt_type = res->type;
2081         rth->rt_key_tos = tos;
2082         rth->rt_dst     = daddr;
2083         rth->rt_src     = saddr;
2084         rth->rt_route_iif = in_dev->dev->ifindex;
2085         rth->rt_iif     = in_dev->dev->ifindex;
2086         rth->rt_oif     = 0;
2087         rth->rt_mark    = skb->mark;
2088         rth->rt_gateway = daddr;
2089         rth->rt_spec_dst= spec_dst;
2090         rth->rt_peer_genid = 0;
2091         rth->peer = NULL;
2092         rth->fi = NULL;
2093
2094         rth->dst.input = ip_forward;
2095         rth->dst.output = ip_output;
2096
2097         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2098
2099         *result = rth;
2100         err = 0;
2101  cleanup:
2102         return err;
2103 }
2104
2105 static int ip_mkroute_input(struct sk_buff *skb,
2106                             struct fib_result *res,
2107                             const struct flowi4 *fl4,
2108                             struct in_device *in_dev,
2109                             __be32 daddr, __be32 saddr, u32 tos)
2110 {
2111         struct rtable* rth = NULL;
2112         int err;
2113         unsigned hash;
2114
2115 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2116         if (res->fi && res->fi->fib_nhs > 1)
2117                 fib_select_multipath(res);
2118 #endif
2119
2120         /* create a routing cache entry */
2121         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2122         if (err)
2123                 return err;
2124
2125         /* put it into the cache */
2126         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2127                        rt_genid(dev_net(rth->dst.dev)));
2128         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2129         if (IS_ERR(rth))
2130                 return PTR_ERR(rth);
2131         return 0;
2132 }
2133
2134 /*
2135  *      NOTE. We drop all the packets that has local source
2136  *      addresses, because every properly looped back packet
2137  *      must have correct destination already attached by output routine.
2138  *
2139  *      Such approach solves two big problems:
2140  *      1. Not simplex devices are handled properly.
2141  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2142  *      called with rcu_read_lock()
2143  */
2144
2145 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2146                                u8 tos, struct net_device *dev)
2147 {
2148         struct fib_result res;
2149         struct in_device *in_dev = __in_dev_get_rcu(dev);
2150         struct flowi4   fl4;
2151         unsigned        flags = 0;
2152         u32             itag = 0;
2153         struct rtable * rth;
2154         unsigned        hash;
2155         __be32          spec_dst;
2156         int             err = -EINVAL;
2157         struct net    * net = dev_net(dev);
2158
2159         /* IP on this device is disabled. */
2160
2161         if (!in_dev)
2162                 goto out;
2163
2164         /* Check for the most weird martians, which can be not detected
2165            by fib_lookup.
2166          */
2167
2168         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2169             ipv4_is_loopback(saddr))
2170                 goto martian_source;
2171
2172         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2173                 goto brd_input;
2174
2175         /* Accept zero addresses only to limited broadcast;
2176          * I even do not know to fix it or not. Waiting for complains :-)
2177          */
2178         if (ipv4_is_zeronet(saddr))
2179                 goto martian_source;
2180
2181         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2182                 goto martian_destination;
2183
2184         /*
2185          *      Now we are ready to route packet.
2186          */
2187         fl4.flowi4_oif = 0;
2188         fl4.flowi4_iif = dev->ifindex;
2189         fl4.flowi4_mark = skb->mark;
2190         fl4.flowi4_tos = tos;
2191         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2192         fl4.daddr = daddr;
2193         fl4.saddr = saddr;
2194         err = fib_lookup(net, &fl4, &res);
2195         if (err != 0) {
2196                 if (!IN_DEV_FORWARD(in_dev))
2197                         goto e_hostunreach;
2198                 goto no_route;
2199         }
2200
2201         RT_CACHE_STAT_INC(in_slow_tot);
2202
2203         if (res.type == RTN_BROADCAST)
2204                 goto brd_input;
2205
2206         if (res.type == RTN_LOCAL) {
2207                 err = fib_validate_source(skb, saddr, daddr, tos,
2208                                           net->loopback_dev->ifindex,
2209                                           dev, &spec_dst, &itag);
2210                 if (err < 0)
2211                         goto martian_source_keep_err;
2212                 if (err)
2213                         flags |= RTCF_DIRECTSRC;
2214                 spec_dst = daddr;
2215                 goto local_input;
2216         }
2217
2218         if (!IN_DEV_FORWARD(in_dev))
2219                 goto e_hostunreach;
2220         if (res.type != RTN_UNICAST)
2221                 goto martian_destination;
2222
2223         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2224 out:    return err;
2225
2226 brd_input:
2227         if (skb->protocol != htons(ETH_P_IP))
2228                 goto e_inval;
2229
2230         if (ipv4_is_zeronet(saddr))
2231                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2232         else {
2233                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2234                                           &itag);
2235                 if (err < 0)
2236                         goto martian_source_keep_err;
2237                 if (err)
2238                         flags |= RTCF_DIRECTSRC;
2239         }
2240         flags |= RTCF_BROADCAST;
2241         res.type = RTN_BROADCAST;
2242         RT_CACHE_STAT_INC(in_brd);
2243
2244 local_input:
2245         rth = rt_dst_alloc(net->loopback_dev,
2246                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2247         if (!rth)
2248                 goto e_nobufs;
2249
2250         rth->dst.input= ip_local_deliver;
2251         rth->dst.output= ip_rt_bug;
2252 #ifdef CONFIG_IP_ROUTE_CLASSID
2253         rth->dst.tclassid = itag;
2254 #endif
2255
2256         rth->rt_key_dst = daddr;
2257         rth->rt_key_src = saddr;
2258         rth->rt_genid = rt_genid(net);
2259         rth->rt_flags   = flags|RTCF_LOCAL;
2260         rth->rt_type    = res.type;
2261         rth->rt_key_tos = tos;
2262         rth->rt_dst     = daddr;
2263         rth->rt_src     = saddr;
2264 #ifdef CONFIG_IP_ROUTE_CLASSID
2265         rth->dst.tclassid = itag;
2266 #endif
2267         rth->rt_route_iif = dev->ifindex;
2268         rth->rt_iif     = dev->ifindex;
2269         rth->rt_oif     = 0;
2270         rth->rt_mark    = skb->mark;
2271         rth->rt_gateway = daddr;
2272         rth->rt_spec_dst= spec_dst;
2273         rth->rt_peer_genid = 0;
2274         rth->peer = NULL;
2275         rth->fi = NULL;
2276         if (res.type == RTN_UNREACHABLE) {
2277                 rth->dst.input= ip_error;
2278                 rth->dst.error= -err;
2279                 rth->rt_flags   &= ~RTCF_LOCAL;
2280         }
2281         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2282         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2283         err = 0;
2284         if (IS_ERR(rth))
2285                 err = PTR_ERR(rth);
2286         goto out;
2287
2288 no_route:
2289         RT_CACHE_STAT_INC(in_no_route);
2290         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2291         res.type = RTN_UNREACHABLE;
2292         if (err == -ESRCH)
2293                 err = -ENETUNREACH;
2294         goto local_input;
2295
2296         /*
2297          *      Do not cache martian addresses: they should be logged (RFC1812)
2298          */
2299 martian_destination:
2300         RT_CACHE_STAT_INC(in_martian_dst);
2301 #ifdef CONFIG_IP_ROUTE_VERBOSE
2302         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2303                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2304                         &daddr, &saddr, dev->name);
2305 #endif
2306
2307 e_hostunreach:
2308         err = -EHOSTUNREACH;
2309         goto out;
2310
2311 e_inval:
2312         err = -EINVAL;
2313         goto out;
2314
2315 e_nobufs:
2316         err = -ENOBUFS;
2317         goto out;
2318
2319 martian_source:
2320         err = -EINVAL;
2321 martian_source_keep_err:
2322         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2323         goto out;
2324 }
2325
2326 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2327                            u8 tos, struct net_device *dev, bool noref)
2328 {
2329         struct rtable * rth;
2330         unsigned        hash;
2331         int iif = dev->ifindex;
2332         struct net *net;
2333         int res;
2334
2335         net = dev_net(dev);
2336
2337         rcu_read_lock();
2338
2339         if (!rt_caching(net))
2340                 goto skip_cache;
2341
2342         tos &= IPTOS_RT_MASK;
2343         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2344
2345         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2346              rth = rcu_dereference(rth->dst.rt_next)) {
2347                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2348                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2349                      (rth->rt_route_iif ^ iif) |
2350                      (rth->rt_key_tos ^ tos)) == 0 &&
2351                     rth->rt_mark == skb->mark &&
2352                     net_eq(dev_net(rth->dst.dev), net) &&
2353                     !rt_is_expired(rth)) {
2354                         if (noref) {
2355                                 dst_use_noref(&rth->dst, jiffies);
2356                                 skb_dst_set_noref(skb, &rth->dst);
2357                         } else {
2358                                 dst_use(&rth->dst, jiffies);
2359                                 skb_dst_set(skb, &rth->dst);
2360                         }
2361                         RT_CACHE_STAT_INC(in_hit);
2362                         rcu_read_unlock();
2363                         return 0;
2364                 }
2365                 RT_CACHE_STAT_INC(in_hlist_search);
2366         }
2367
2368 skip_cache:
2369         /* Multicast recognition logic is moved from route cache to here.
2370            The problem was that too many Ethernet cards have broken/missing
2371            hardware multicast filters :-( As result the host on multicasting
2372            network acquires a lot of useless route cache entries, sort of
2373            SDR messages from all the world. Now we try to get rid of them.
2374            Really, provided software IP multicast filter is organized
2375            reasonably (at least, hashed), it does not result in a slowdown
2376            comparing with route cache reject entries.
2377            Note, that multicast routers are not affected, because
2378            route cache entry is created eventually.
2379          */
2380         if (ipv4_is_multicast(daddr)) {
2381                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2382
2383                 if (in_dev) {
2384                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2385                                                   ip_hdr(skb)->protocol);
2386                         if (our
2387 #ifdef CONFIG_IP_MROUTE
2388                                 ||
2389                             (!ipv4_is_local_multicast(daddr) &&
2390                              IN_DEV_MFORWARD(in_dev))
2391 #endif
2392                            ) {
2393                                 int res = ip_route_input_mc(skb, daddr, saddr,
2394                                                             tos, dev, our);
2395                                 rcu_read_unlock();
2396                                 return res;
2397                         }
2398                 }
2399                 rcu_read_unlock();
2400                 return -EINVAL;
2401         }
2402         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2403         rcu_read_unlock();
2404         return res;
2405 }
2406 EXPORT_SYMBOL(ip_route_input_common);
2407
2408 /* called with rcu_read_lock() */
2409 static struct rtable *__mkroute_output(const struct fib_result *res,
2410                                        const struct flowi4 *fl4,
2411                                        __be32 orig_daddr, __be32 orig_saddr,
2412                                        int orig_oif, struct net_device *dev_out,
2413                                        unsigned int flags)
2414 {
2415         struct fib_info *fi = res->fi;
2416         u32 tos = RT_FL_TOS(fl4);
2417         struct in_device *in_dev;
2418         u16 type = res->type;
2419         struct rtable *rth;
2420
2421         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2422                 return ERR_PTR(-EINVAL);
2423
2424         if (ipv4_is_lbcast(fl4->daddr))
2425                 type = RTN_BROADCAST;
2426         else if (ipv4_is_multicast(fl4->daddr))
2427                 type = RTN_MULTICAST;
2428         else if (ipv4_is_zeronet(fl4->daddr))
2429                 return ERR_PTR(-EINVAL);
2430
2431         if (dev_out->flags & IFF_LOOPBACK)
2432                 flags |= RTCF_LOCAL;
2433
2434         in_dev = __in_dev_get_rcu(dev_out);
2435         if (!in_dev)
2436                 return ERR_PTR(-EINVAL);
2437
2438         if (type == RTN_BROADCAST) {
2439                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2440                 fi = NULL;
2441         } else if (type == RTN_MULTICAST) {
2442                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2443                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2444                                      fl4->flowi4_proto))
2445                         flags &= ~RTCF_LOCAL;
2446                 /* If multicast route do not exist use
2447                  * default one, but do not gateway in this case.
2448                  * Yes, it is hack.
2449                  */
2450                 if (fi && res->prefixlen < 4)
2451                         fi = NULL;
2452         }
2453
2454         rth = rt_dst_alloc(dev_out,
2455                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2456                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2457         if (!rth)
2458                 return ERR_PTR(-ENOBUFS);
2459
2460         rth->dst.output = ip_output;
2461
2462         rth->rt_key_dst = orig_daddr;
2463         rth->rt_key_src = orig_saddr;
2464         rth->rt_genid = rt_genid(dev_net(dev_out));
2465         rth->rt_flags   = flags;
2466         rth->rt_type    = type;
2467         rth->rt_key_tos = tos;
2468         rth->rt_dst     = fl4->daddr;
2469         rth->rt_src     = fl4->saddr;
2470         rth->rt_route_iif = 0;
2471         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2472         rth->rt_oif     = orig_oif;
2473         rth->rt_mark    = fl4->flowi4_mark;
2474         rth->rt_gateway = fl4->daddr;
2475         rth->rt_spec_dst= fl4->saddr;
2476         rth->rt_peer_genid = 0;
2477         rth->peer = NULL;
2478         rth->fi = NULL;
2479
2480         RT_CACHE_STAT_INC(out_slow_tot);
2481
2482         if (flags & RTCF_LOCAL) {
2483                 rth->dst.input = ip_local_deliver;
2484                 rth->rt_spec_dst = fl4->daddr;
2485         }
2486         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2487                 rth->rt_spec_dst = fl4->saddr;
2488                 if (flags & RTCF_LOCAL &&
2489                     !(dev_out->flags & IFF_LOOPBACK)) {
2490                         rth->dst.output = ip_mc_output;
2491                         RT_CACHE_STAT_INC(out_slow_mc);
2492                 }
2493 #ifdef CONFIG_IP_MROUTE
2494                 if (type == RTN_MULTICAST) {
2495                         if (IN_DEV_MFORWARD(in_dev) &&
2496                             !ipv4_is_local_multicast(fl4->daddr)) {
2497                                 rth->dst.input = ip_mr_input;
2498                                 rth->dst.output = ip_mc_output;
2499                         }
2500                 }
2501 #endif
2502         }
2503
2504         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2505
2506         return rth;
2507 }
2508
2509 /*
2510  * Major route resolver routine.
2511  * called with rcu_read_lock();
2512  */
2513
2514 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2515 {
2516         struct net_device *dev_out = NULL;
2517         u32 tos = RT_FL_TOS(fl4);
2518         unsigned int flags = 0;
2519         struct fib_result res;
2520         struct rtable *rth;
2521         __be32 orig_daddr;
2522         __be32 orig_saddr;
2523         int orig_oif;
2524
2525         res.fi          = NULL;
2526 #ifdef CONFIG_IP_MULTIPLE_TABLES
2527         res.r           = NULL;
2528 #endif
2529
2530         orig_daddr = fl4->daddr;
2531         orig_saddr = fl4->saddr;
2532         orig_oif = fl4->flowi4_oif;
2533
2534         fl4->flowi4_iif = net->loopback_dev->ifindex;
2535         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2536         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2537                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2538
2539         rcu_read_lock();
2540         if (fl4->saddr) {
2541                 rth = ERR_PTR(-EINVAL);
2542                 if (ipv4_is_multicast(fl4->saddr) ||
2543                     ipv4_is_lbcast(fl4->saddr) ||
2544                     ipv4_is_zeronet(fl4->saddr))
2545                         goto out;
2546
2547                 /* I removed check for oif == dev_out->oif here.
2548                    It was wrong for two reasons:
2549                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2550                       is assigned to multiple interfaces.
2551                    2. Moreover, we are allowed to send packets with saddr
2552                       of another iface. --ANK
2553                  */
2554
2555                 if (fl4->flowi4_oif == 0 &&
2556                     (ipv4_is_multicast(fl4->daddr) ||
2557                      ipv4_is_lbcast(fl4->daddr))) {
2558                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2559                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2560                         if (dev_out == NULL)
2561                                 goto out;
2562
2563                         /* Special hack: user can direct multicasts
2564                            and limited broadcast via necessary interface
2565                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2566                            This hack is not just for fun, it allows
2567                            vic,vat and friends to work.
2568                            They bind socket to loopback, set ttl to zero
2569                            and expect that it will work.
2570                            From the viewpoint of routing cache they are broken,
2571                            because we are not allowed to build multicast path
2572                            with loopback source addr (look, routing cache
2573                            cannot know, that ttl is zero, so that packet
2574                            will not leave this host and route is valid).
2575                            Luckily, this hack is good workaround.
2576                          */
2577
2578                         fl4->flowi4_oif = dev_out->ifindex;
2579                         goto make_route;
2580                 }
2581
2582                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2583                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2584                         if (!__ip_dev_find(net, fl4->saddr, false))
2585                                 goto out;
2586                 }
2587         }
2588
2589
2590         if (fl4->flowi4_oif) {
2591                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2592                 rth = ERR_PTR(-ENODEV);
2593                 if (dev_out == NULL)
2594                         goto out;
2595
2596                 /* RACE: Check return value of inet_select_addr instead. */
2597                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2598                         rth = ERR_PTR(-ENETUNREACH);
2599                         goto out;
2600                 }
2601                 if (ipv4_is_local_multicast(fl4->daddr) ||
2602                     ipv4_is_lbcast(fl4->daddr)) {
2603                         if (!fl4->saddr)
2604                                 fl4->saddr = inet_select_addr(dev_out, 0,
2605                                                               RT_SCOPE_LINK);
2606                         goto make_route;
2607                 }
2608                 if (fl4->saddr) {
2609                         if (ipv4_is_multicast(fl4->daddr))
2610                                 fl4->saddr = inet_select_addr(dev_out, 0,
2611                                                               fl4->flowi4_scope);
2612                         else if (!fl4->daddr)
2613                                 fl4->saddr = inet_select_addr(dev_out, 0,
2614                                                               RT_SCOPE_HOST);
2615                 }
2616         }
2617
2618         if (!fl4->daddr) {
2619                 fl4->daddr = fl4->saddr;
2620                 if (!fl4->daddr)
2621                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2622                 dev_out = net->loopback_dev;
2623                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2624                 res.type = RTN_LOCAL;
2625                 flags |= RTCF_LOCAL;
2626                 goto make_route;
2627         }
2628
2629         if (fib_lookup(net, fl4, &res)) {
2630                 res.fi = NULL;
2631                 if (fl4->flowi4_oif) {
2632                         /* Apparently, routing tables are wrong. Assume,
2633                            that the destination is on link.
2634
2635                            WHY? DW.
2636                            Because we are allowed to send to iface
2637                            even if it has NO routes and NO assigned
2638                            addresses. When oif is specified, routing
2639                            tables are looked up with only one purpose:
2640                            to catch if destination is gatewayed, rather than
2641                            direct. Moreover, if MSG_DONTROUTE is set,
2642                            we send packet, ignoring both routing tables
2643                            and ifaddr state. --ANK
2644
2645
2646                            We could make it even if oif is unknown,
2647                            likely IPv6, but we do not.
2648                          */
2649
2650                         if (fl4->saddr == 0)
2651                                 fl4->saddr = inet_select_addr(dev_out, 0,
2652                                                               RT_SCOPE_LINK);
2653                         res.type = RTN_UNICAST;
2654                         goto make_route;
2655                 }
2656                 rth = ERR_PTR(-ENETUNREACH);
2657                 goto out;
2658         }
2659
2660         if (res.type == RTN_LOCAL) {
2661                 if (!fl4->saddr) {
2662                         if (res.fi->fib_prefsrc)
2663                                 fl4->saddr = res.fi->fib_prefsrc;
2664                         else
2665                                 fl4->saddr = fl4->daddr;
2666                 }
2667                 dev_out = net->loopback_dev;
2668                 fl4->flowi4_oif = dev_out->ifindex;
2669                 res.fi = NULL;
2670                 flags |= RTCF_LOCAL;
2671                 goto make_route;
2672         }
2673
2674 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2675         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2676                 fib_select_multipath(&res);
2677         else
2678 #endif
2679         if (!res.prefixlen &&
2680             res.table->tb_num_default > 1 &&
2681             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2682                 fib_select_default(&res);
2683
2684         if (!fl4->saddr)
2685                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2686
2687         dev_out = FIB_RES_DEV(res);
2688         fl4->flowi4_oif = dev_out->ifindex;
2689
2690
2691 make_route:
2692         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2693                                dev_out, flags);
2694         if (!IS_ERR(rth)) {
2695                 unsigned int hash;
2696
2697                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2698                                rt_genid(dev_net(dev_out)));
2699                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2700         }
2701
2702 out:
2703         rcu_read_unlock();
2704         return rth;
2705 }
2706
2707 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2708 {
2709         struct rtable *rth;
2710         unsigned int hash;
2711
2712         if (!rt_caching(net))
2713                 goto slow_output;
2714
2715         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2716
2717         rcu_read_lock_bh();
2718         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2719                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2720                 if (rth->rt_key_dst == flp4->daddr &&
2721                     rth->rt_key_src == flp4->saddr &&
2722                     rt_is_output_route(rth) &&
2723                     rth->rt_oif == flp4->flowi4_oif &&
2724                     rth->rt_mark == flp4->flowi4_mark &&
2725                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2726                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2727                     net_eq(dev_net(rth->dst.dev), net) &&
2728                     !rt_is_expired(rth)) {
2729                         dst_use(&rth->dst, jiffies);
2730                         RT_CACHE_STAT_INC(out_hit);
2731                         rcu_read_unlock_bh();
2732                         if (!flp4->saddr)
2733                                 flp4->saddr = rth->rt_src;
2734                         if (!flp4->daddr)
2735                                 flp4->daddr = rth->rt_dst;
2736                         return rth;
2737                 }
2738                 RT_CACHE_STAT_INC(out_hlist_search);
2739         }
2740         rcu_read_unlock_bh();
2741
2742 slow_output:
2743         return ip_route_output_slow(net, flp4);
2744 }
2745 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2746
2747 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2748 {
2749         return NULL;
2750 }
2751
2752 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2753 {
2754         return 0;
2755 }
2756
2757 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2758 {
2759 }
2760
2761 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2762                                           unsigned long old)
2763 {
2764         return NULL;
2765 }
2766
2767 static struct dst_ops ipv4_dst_blackhole_ops = {
2768         .family                 =       AF_INET,
2769         .protocol               =       cpu_to_be16(ETH_P_IP),
2770         .destroy                =       ipv4_dst_destroy,
2771         .check                  =       ipv4_blackhole_dst_check,
2772         .default_mtu            =       ipv4_blackhole_default_mtu,
2773         .default_advmss         =       ipv4_default_advmss,
2774         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2775         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2776         .neigh_lookup           =       ipv4_neigh_lookup,
2777 };
2778
2779 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2780 {
2781         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2782         struct rtable *ort = (struct rtable *) dst_orig;
2783
2784         if (rt) {
2785                 struct dst_entry *new = &rt->dst;
2786
2787                 new->__use = 1;
2788                 new->input = dst_discard;
2789                 new->output = dst_discard;
2790                 dst_copy_metrics(new, &ort->dst);
2791
2792                 new->dev = ort->dst.dev;
2793                 if (new->dev)
2794                         dev_hold(new->dev);
2795
2796                 rt->rt_key_dst = ort->rt_key_dst;
2797                 rt->rt_key_src = ort->rt_key_src;
2798                 rt->rt_key_tos = ort->rt_key_tos;
2799                 rt->rt_route_iif = ort->rt_route_iif;
2800                 rt->rt_iif = ort->rt_iif;
2801                 rt->rt_oif = ort->rt_oif;
2802                 rt->rt_mark = ort->rt_mark;
2803
2804                 rt->rt_genid = rt_genid(net);
2805                 rt->rt_flags = ort->rt_flags;
2806                 rt->rt_type = ort->rt_type;
2807                 rt->rt_dst = ort->rt_dst;
2808                 rt->rt_src = ort->rt_src;
2809                 rt->rt_gateway = ort->rt_gateway;
2810                 rt->rt_spec_dst = ort->rt_spec_dst;
2811                 rt->peer = ort->peer;
2812                 if (rt->peer)
2813                         atomic_inc(&rt->peer->refcnt);
2814                 rt->fi = ort->fi;
2815                 if (rt->fi)
2816                         atomic_inc(&rt->fi->fib_clntref);
2817
2818                 dst_free(new);
2819         }
2820
2821         dst_release(dst_orig);
2822
2823         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2824 }
2825
2826 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2827                                     struct sock *sk)
2828 {
2829         struct rtable *rt = __ip_route_output_key(net, flp4);
2830
2831         if (IS_ERR(rt))
2832                 return rt;
2833
2834         if (flp4->flowi4_proto)
2835                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2836                                                    flowi4_to_flowi(flp4),
2837                                                    sk, 0);
2838
2839         return rt;
2840 }
2841 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2842
2843 static int rt_fill_info(struct net *net,
2844                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2845                         int nowait, unsigned int flags)
2846 {
2847         struct rtable *rt = skb_rtable(skb);
2848         struct rtmsg *r;
2849         struct nlmsghdr *nlh;
2850         long expires = 0;
2851         const struct inet_peer *peer = rt->peer;
2852         u32 id = 0, ts = 0, tsage = 0, error;
2853
2854         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2855         if (nlh == NULL)
2856                 return -EMSGSIZE;
2857
2858         r = nlmsg_data(nlh);
2859         r->rtm_family    = AF_INET;
2860         r->rtm_dst_len  = 32;
2861         r->rtm_src_len  = 0;
2862         r->rtm_tos      = rt->rt_key_tos;
2863         r->rtm_table    = RT_TABLE_MAIN;
2864         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2865         r->rtm_type     = rt->rt_type;
2866         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2867         r->rtm_protocol = RTPROT_UNSPEC;
2868         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2869         if (rt->rt_flags & RTCF_NOTIFY)
2870                 r->rtm_flags |= RTM_F_NOTIFY;
2871
2872         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2873
2874         if (rt->rt_key_src) {
2875                 r->rtm_src_len = 32;
2876                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2877         }
2878         if (rt->dst.dev)
2879                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2880 #ifdef CONFIG_IP_ROUTE_CLASSID
2881         if (rt->dst.tclassid)
2882                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2883 #endif
2884         if (rt_is_input_route(rt))
2885                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2886         else if (rt->rt_src != rt->rt_key_src)
2887                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2888
2889         if (rt->rt_dst != rt->rt_gateway)
2890                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2891
2892         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2893                 goto nla_put_failure;
2894
2895         if (rt->rt_mark)
2896                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2897
2898         error = rt->dst.error;
2899         if (peer) {
2900                 inet_peer_refcheck(rt->peer);
2901                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2902                 if (peer->tcp_ts_stamp) {
2903                         ts = peer->tcp_ts;
2904                         tsage = get_seconds() - peer->tcp_ts_stamp;
2905                 }
2906                 expires = ACCESS_ONCE(peer->pmtu_expires);
2907                 if (expires)
2908                         expires -= jiffies;
2909         }
2910
2911         if (rt_is_input_route(rt)) {
2912 #ifdef CONFIG_IP_MROUTE
2913                 __be32 dst = rt->rt_dst;
2914
2915                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2916                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2917                         int err = ipmr_get_route(net, skb,
2918                                                  rt->rt_src, rt->rt_dst,
2919                                                  r, nowait);
2920                         if (err <= 0) {
2921                                 if (!nowait) {
2922                                         if (err == 0)
2923                                                 return 0;
2924                                         goto nla_put_failure;
2925                                 } else {
2926                                         if (err == -EMSGSIZE)
2927                                                 goto nla_put_failure;
2928                                         error = err;
2929                                 }
2930                         }
2931                 } else
2932 #endif
2933                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2934         }
2935
2936         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2937                                expires, error) < 0)
2938                 goto nla_put_failure;
2939
2940         return nlmsg_end(skb, nlh);
2941
2942 nla_put_failure:
2943         nlmsg_cancel(skb, nlh);
2944         return -EMSGSIZE;
2945 }
2946
2947 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2948 {
2949         struct net *net = sock_net(in_skb->sk);
2950         struct rtmsg *rtm;
2951         struct nlattr *tb[RTA_MAX+1];
2952         struct rtable *rt = NULL;
2953         __be32 dst = 0;
2954         __be32 src = 0;
2955         u32 iif;
2956         int err;
2957         int mark;
2958         struct sk_buff *skb;
2959
2960         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2961         if (err < 0)
2962                 goto errout;
2963
2964         rtm = nlmsg_data(nlh);
2965
2966         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2967         if (skb == NULL) {
2968                 err = -ENOBUFS;
2969                 goto errout;
2970         }
2971
2972         /* Reserve room for dummy headers, this skb can pass
2973            through good chunk of routing engine.
2974          */
2975         skb_reset_mac_header(skb);
2976         skb_reset_network_header(skb);
2977
2978         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2979         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2980         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2981
2982         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2983         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2984         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2985         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2986
2987         if (iif) {
2988                 struct net_device *dev;
2989
2990                 dev = __dev_get_by_index(net, iif);
2991                 if (dev == NULL) {
2992                         err = -ENODEV;
2993                         goto errout_free;
2994                 }
2995
2996                 skb->protocol   = htons(ETH_P_IP);
2997                 skb->dev        = dev;
2998                 skb->mark       = mark;
2999                 local_bh_disable();
3000                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3001                 local_bh_enable();
3002
3003                 rt = skb_rtable(skb);
3004                 if (err == 0 && rt->dst.error)
3005                         err = -rt->dst.error;
3006         } else {
3007                 struct flowi4 fl4 = {
3008                         .daddr = dst,
3009                         .saddr = src,
3010                         .flowi4_tos = rtm->rtm_tos,
3011                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3012                         .flowi4_mark = mark,
3013                 };
3014                 rt = ip_route_output_key(net, &fl4);
3015
3016                 err = 0;
3017                 if (IS_ERR(rt))
3018                         err = PTR_ERR(rt);
3019         }
3020
3021         if (err)
3022                 goto errout_free;
3023
3024         skb_dst_set(skb, &rt->dst);
3025         if (rtm->rtm_flags & RTM_F_NOTIFY)
3026                 rt->rt_flags |= RTCF_NOTIFY;
3027
3028         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3029                            RTM_NEWROUTE, 0, 0);
3030         if (err <= 0)
3031                 goto errout_free;
3032
3033         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3034 errout:
3035         return err;
3036
3037 errout_free:
3038         kfree_skb(skb);
3039         goto errout;
3040 }
3041
3042 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3043 {
3044         struct rtable *rt;
3045         int h, s_h;
3046         int idx, s_idx;
3047         struct net *net;
3048
3049         net = sock_net(skb->sk);
3050
3051         s_h = cb->args[0];
3052         if (s_h < 0)
3053                 s_h = 0;
3054         s_idx = idx = cb->args[1];
3055         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3056                 if (!rt_hash_table[h].chain)
3057                         continue;
3058                 rcu_read_lock_bh();
3059                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3060                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3061                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3062                                 continue;
3063                         if (rt_is_expired(rt))
3064                                 continue;
3065                         skb_dst_set_noref(skb, &rt->dst);
3066                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3067                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3068                                          1, NLM_F_MULTI) <= 0) {
3069                                 skb_dst_drop(skb);
3070                                 rcu_read_unlock_bh();
3071                                 goto done;
3072                         }
3073                         skb_dst_drop(skb);
3074                 }
3075                 rcu_read_unlock_bh();
3076         }
3077
3078 done:
3079         cb->args[0] = h;
3080         cb->args[1] = idx;
3081         return skb->len;
3082 }
3083
3084 void ip_rt_multicast_event(struct in_device *in_dev)
3085 {
3086         rt_cache_flush(dev_net(in_dev->dev), 0);
3087 }
3088
3089 #ifdef CONFIG_SYSCTL
3090 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3091                                         void __user *buffer,
3092                                         size_t *lenp, loff_t *ppos)
3093 {
3094         if (write) {
3095                 int flush_delay;
3096                 ctl_table ctl;
3097                 struct net *net;
3098
3099                 memcpy(&ctl, __ctl, sizeof(ctl));
3100                 ctl.data = &flush_delay;
3101                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3102
3103                 net = (struct net *)__ctl->extra1;
3104                 rt_cache_flush(net, flush_delay);
3105                 return 0;
3106         }
3107
3108         return -EINVAL;
3109 }
3110
3111 static ctl_table ipv4_route_table[] = {
3112         {
3113                 .procname       = "gc_thresh",
3114                 .data           = &ipv4_dst_ops.gc_thresh,
3115                 .maxlen         = sizeof(int),
3116                 .mode           = 0644,
3117                 .proc_handler   = proc_dointvec,
3118         },
3119         {
3120                 .procname       = "max_size",
3121                 .data           = &ip_rt_max_size,
3122                 .maxlen         = sizeof(int),
3123                 .mode           = 0644,
3124                 .proc_handler   = proc_dointvec,
3125         },
3126         {
3127                 /*  Deprecated. Use gc_min_interval_ms */
3128
3129                 .procname       = "gc_min_interval",
3130                 .data           = &ip_rt_gc_min_interval,
3131                 .maxlen         = sizeof(int),
3132                 .mode           = 0644,
3133                 .proc_handler   = proc_dointvec_jiffies,
3134         },
3135         {
3136                 .procname       = "gc_min_interval_ms",
3137                 .data           = &ip_rt_gc_min_interval,
3138                 .maxlen         = sizeof(int),
3139                 .mode           = 0644,
3140                 .proc_handler   = proc_dointvec_ms_jiffies,
3141         },
3142         {
3143                 .procname       = "gc_timeout",
3144                 .data           = &ip_rt_gc_timeout,
3145                 .maxlen         = sizeof(int),
3146                 .mode           = 0644,
3147                 .proc_handler   = proc_dointvec_jiffies,
3148         },
3149         {
3150                 .procname       = "gc_interval",
3151                 .data           = &ip_rt_gc_interval,
3152                 .maxlen         = sizeof(int),
3153                 .mode           = 0644,
3154                 .proc_handler   = proc_dointvec_jiffies,
3155         },
3156         {
3157                 .procname       = "redirect_load",
3158                 .data           = &ip_rt_redirect_load,
3159                 .maxlen         = sizeof(int),
3160                 .mode           = 0644,
3161                 .proc_handler   = proc_dointvec,
3162         },
3163         {
3164                 .procname       = "redirect_number",
3165                 .data           = &ip_rt_redirect_number,
3166                 .maxlen         = sizeof(int),
3167                 .mode           = 0644,
3168                 .proc_handler   = proc_dointvec,
3169         },
3170         {
3171                 .procname       = "redirect_silence",
3172                 .data           = &ip_rt_redirect_silence,
3173                 .maxlen         = sizeof(int),
3174                 .mode           = 0644,
3175                 .proc_handler   = proc_dointvec,
3176         },
3177         {
3178                 .procname       = "error_cost",
3179                 .data           = &ip_rt_error_cost,
3180                 .maxlen         = sizeof(int),
3181                 .mode           = 0644,
3182                 .proc_handler   = proc_dointvec,
3183         },
3184         {
3185                 .procname       = "error_burst",
3186                 .data           = &ip_rt_error_burst,
3187                 .maxlen         = sizeof(int),
3188                 .mode           = 0644,
3189                 .proc_handler   = proc_dointvec,
3190         },
3191         {
3192                 .procname       = "gc_elasticity",
3193                 .data           = &ip_rt_gc_elasticity,
3194                 .maxlen         = sizeof(int),
3195                 .mode           = 0644,
3196                 .proc_handler   = proc_dointvec,
3197         },
3198         {
3199                 .procname       = "mtu_expires",
3200                 .data           = &ip_rt_mtu_expires,
3201                 .maxlen         = sizeof(int),
3202                 .mode           = 0644,
3203                 .proc_handler   = proc_dointvec_jiffies,
3204         },
3205         {
3206                 .procname       = "min_pmtu",
3207                 .data           = &ip_rt_min_pmtu,
3208                 .maxlen         = sizeof(int),
3209                 .mode           = 0644,
3210                 .proc_handler   = proc_dointvec,
3211         },
3212         {
3213                 .procname       = "min_adv_mss",
3214                 .data           = &ip_rt_min_advmss,
3215                 .maxlen         = sizeof(int),
3216                 .mode           = 0644,
3217                 .proc_handler   = proc_dointvec,
3218         },
3219         { }
3220 };
3221
3222 static struct ctl_table empty[1];
3223
3224 static struct ctl_table ipv4_skeleton[] =
3225 {
3226         { .procname = "route",
3227           .mode = 0555, .child = ipv4_route_table},
3228         { .procname = "neigh",
3229           .mode = 0555, .child = empty},
3230         { }
3231 };
3232
3233 static __net_initdata struct ctl_path ipv4_path[] = {
3234         { .procname = "net", },
3235         { .procname = "ipv4", },
3236         { },
3237 };
3238
3239 static struct ctl_table ipv4_route_flush_table[] = {
3240         {
3241                 .procname       = "flush",
3242                 .maxlen         = sizeof(int),
3243                 .mode           = 0200,
3244                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3245         },
3246         { },
3247 };
3248
3249 static __net_initdata struct ctl_path ipv4_route_path[] = {
3250         { .procname = "net", },
3251         { .procname = "ipv4", },
3252         { .procname = "route", },
3253         { },
3254 };
3255
3256 static __net_init int sysctl_route_net_init(struct net *net)
3257 {
3258         struct ctl_table *tbl;
3259
3260         tbl = ipv4_route_flush_table;
3261         if (!net_eq(net, &init_net)) {
3262                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3263                 if (tbl == NULL)
3264                         goto err_dup;
3265         }
3266         tbl[0].extra1 = net;
3267
3268         net->ipv4.route_hdr =
3269                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3270         if (net->ipv4.route_hdr == NULL)
3271                 goto err_reg;
3272         return 0;
3273
3274 err_reg:
3275         if (tbl != ipv4_route_flush_table)
3276                 kfree(tbl);
3277 err_dup:
3278         return -ENOMEM;
3279 }
3280
3281 static __net_exit void sysctl_route_net_exit(struct net *net)
3282 {
3283         struct ctl_table *tbl;
3284
3285         tbl = net->ipv4.route_hdr->ctl_table_arg;
3286         unregister_net_sysctl_table(net->ipv4.route_hdr);
3287         BUG_ON(tbl == ipv4_route_flush_table);
3288         kfree(tbl);
3289 }
3290
3291 static __net_initdata struct pernet_operations sysctl_route_ops = {
3292         .init = sysctl_route_net_init,
3293         .exit = sysctl_route_net_exit,
3294 };
3295 #endif
3296
3297 static __net_init int rt_genid_init(struct net *net)
3298 {
3299         get_random_bytes(&net->ipv4.rt_genid,
3300                          sizeof(net->ipv4.rt_genid));
3301         get_random_bytes(&net->ipv4.dev_addr_genid,
3302                          sizeof(net->ipv4.dev_addr_genid));
3303         return 0;
3304 }
3305
3306 static __net_initdata struct pernet_operations rt_genid_ops = {
3307         .init = rt_genid_init,
3308 };
3309
3310
3311 #ifdef CONFIG_IP_ROUTE_CLASSID
3312 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3313 #endif /* CONFIG_IP_ROUTE_CLASSID */
3314
3315 static __initdata unsigned long rhash_entries;
3316 static int __init set_rhash_entries(char *str)
3317 {
3318         if (!str)
3319                 return 0;
3320         rhash_entries = simple_strtoul(str, &str, 0);
3321         return 1;
3322 }
3323 __setup("rhash_entries=", set_rhash_entries);
3324
3325 int __init ip_rt_init(void)
3326 {
3327         int rc = 0;
3328
3329 #ifdef CONFIG_IP_ROUTE_CLASSID
3330         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3331         if (!ip_rt_acct)
3332                 panic("IP: failed to allocate ip_rt_acct\n");
3333 #endif
3334
3335         ipv4_dst_ops.kmem_cachep =
3336                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3337                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3338
3339         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3340
3341         if (dst_entries_init(&ipv4_dst_ops) < 0)
3342                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3343
3344         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3345                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3346
3347         rt_hash_table = (struct rt_hash_bucket *)
3348                 alloc_large_system_hash("IP route cache",
3349                                         sizeof(struct rt_hash_bucket),
3350                                         rhash_entries,
3351                                         (totalram_pages >= 128 * 1024) ?
3352                                         15 : 17,
3353                                         0,
3354                                         &rt_hash_log,
3355                                         &rt_hash_mask,
3356                                         rhash_entries ? 0 : 512 * 1024);
3357         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3358         rt_hash_lock_init();
3359
3360         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3361         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3362
3363         devinet_init();
3364         ip_fib_init();
3365
3366         if (ip_rt_proc_init())
3367                 printk(KERN_ERR "Unable to create route proc files\n");
3368 #ifdef CONFIG_XFRM
3369         xfrm_init();
3370         xfrm4_init(ip_rt_max_size);
3371 #endif
3372         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3373
3374 #ifdef CONFIG_SYSCTL
3375         register_pernet_subsys(&sysctl_route_ops);
3376 #endif
3377         register_pernet_subsys(&rt_genid_ops);
3378         return rc;
3379 }
3380
3381 #ifdef CONFIG_SYSCTL
3382 /*
3383  * We really need to sanitize the damn ipv4 init order, then all
3384  * this nonsense will go away.
3385  */
3386 void __init ip_static_sysctl_init(void)
3387 {
3388         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3389 }
3390 #endif