net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111 #include <net/atmclip.h>
 112
 113 #define RT_FL_TOS(oldflp4) \
 114     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 115
 116 #define IP_MAX_MTU      0xFFF0
 117
 118 #define RT_GC_TIMEOUT (300*HZ)
 119
 120 static int ip_rt_max_size;
 121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 122 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_gc_elasticity __read_mostly    = 8;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133 static int rt_chain_length_max __read_mostly    = 20;
 134
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 144 static void              ipv4_link_failure(struct sk_buff *skb);
 145 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 146 static int rt_garbage_collect(struct dst_ops *ops);
 147
 148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 149                             int how)
 150 {
 151 }
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         struct rtable *rt = (struct rtable *) dst;
 156         struct inet_peer *peer;
 157         u32 *p = NULL;
 158
 159         if (!rt->peer)
 160                 rt_bind_peer(rt, rt->rt_dst, 1);
 161
 162         peer = rt->peer;
 163         if (peer) {
 164                 u32 *old_p = __DST_METRICS_PTR(old);
 165                 unsigned long prev, new;
 166
 167                 p = peer->metrics;
 168                 if (inet_metrics_new(peer))
 169                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 170
 171                 new = (unsigned long) p;
 172                 prev = cmpxchg(&dst->_metrics, old, new);
 173
 174                 if (prev != old) {
 175                         p = __DST_METRICS_PTR(prev);
 176                         if (prev & DST_METRICS_READ_ONLY)
 177                                 p = NULL;
 178                 } else {
 179                         if (rt->fi) {
 180                                 fib_info_put(rt->fi);
 181                                 rt->fi = NULL;
 182                         }
 183                 }
 184         }
 185         return p;
 186 }
 187
 188 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 189
 190 static struct dst_ops ipv4_dst_ops = {
 191         .family =               AF_INET,
 192         .protocol =             cpu_to_be16(ETH_P_IP),
 193         .gc =                   rt_garbage_collect,
 194         .check =                ipv4_dst_check,
 195         .default_advmss =       ipv4_default_advmss,
 196         .default_mtu =          ipv4_default_mtu,
 197         .cow_metrics =          ipv4_cow_metrics,
 198         .destroy =              ipv4_dst_destroy,
 199         .ifdown =               ipv4_dst_ifdown,
 200         .negative_advice =      ipv4_negative_advice,
 201         .link_failure =         ipv4_link_failure,
 202         .update_pmtu =          ip_rt_update_pmtu,
 203         .local_out =            __ip_local_out,
 204         .neigh_lookup =         ipv4_neigh_lookup,
 205 };
 206
 207 #define ECN_OR_COST(class)      TC_PRIO_##class
 208
 209 const __u8 ip_tos2prio[16] = {
 210         TC_PRIO_BESTEFFORT,
 211         ECN_OR_COST(BESTEFFORT),
 212         TC_PRIO_BESTEFFORT,
 213         ECN_OR_COST(BESTEFFORT),
 214         TC_PRIO_BULK,
 215         ECN_OR_COST(BULK),
 216         TC_PRIO_BULK,
 217         ECN_OR_COST(BULK),
 218         TC_PRIO_INTERACTIVE,
 219         ECN_OR_COST(INTERACTIVE),
 220         TC_PRIO_INTERACTIVE,
 221         ECN_OR_COST(INTERACTIVE),
 222         TC_PRIO_INTERACTIVE_BULK,
 223         ECN_OR_COST(INTERACTIVE_BULK),
 224         TC_PRIO_INTERACTIVE_BULK,
 225         ECN_OR_COST(INTERACTIVE_BULK)
 226 };
 227
 228
 229 /*
 230  * Route cache.
 231  */
 232
 233 /* The locking scheme is rather straight forward:
 234  *
 235  * 1) Read-Copy Update protects the buckets of the central route hash.
 236  * 2) Only writers remove entries, and they hold the lock
 237  *    as they look at rtable reference counts.
 238  * 3) Only readers acquire references to rtable entries,
 239  *    they do so with atomic increments and with the
 240  *    lock held.
 241  */
 242
 243 struct rt_hash_bucket {
 244         struct rtable __rcu     *chain;
 245 };
 246
 247 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 248         defined(CONFIG_PROVE_LOCKING)
 249 /*
 250  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 251  * The size of this table is a power of two and depends on the number of CPUS.
 252  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 253  */
 254 #ifdef CONFIG_LOCKDEP
 255 # define RT_HASH_LOCK_SZ        256
 256 #else
 257 # if NR_CPUS >= 32
 258 #  define RT_HASH_LOCK_SZ       4096
 259 # elif NR_CPUS >= 16
 260 #  define RT_HASH_LOCK_SZ       2048
 261 # elif NR_CPUS >= 8
 262 #  define RT_HASH_LOCK_SZ       1024
 263 # elif NR_CPUS >= 4
 264 #  define RT_HASH_LOCK_SZ       512
 265 # else
 266 #  define RT_HASH_LOCK_SZ       256
 267 # endif
 268 #endif
 269
 270 static spinlock_t       *rt_hash_locks;
 271 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 272
 273 static __init void rt_hash_lock_init(void)
 274 {
 275         int i;
 276
 277         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 278                         GFP_KERNEL);
 279         if (!rt_hash_locks)
 280                 panic("IP: failed to allocate rt_hash_locks\n");
 281
 282         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 283                 spin_lock_init(&rt_hash_locks[i]);
 284 }
 285 #else
 286 # define rt_hash_lock_addr(slot) NULL
 287
 288 static inline void rt_hash_lock_init(void)
 289 {
 290 }
 291 #endif
 292
 293 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 294 static unsigned                 rt_hash_mask __read_mostly;
 295 static unsigned int             rt_hash_log  __read_mostly;
 296
 297 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 298 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 299
 300 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 301                                    int genid)
 302 {
 303         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 304                             idx, genid)
 305                 & rt_hash_mask;
 306 }
 307
 308 static inline int rt_genid(struct net *net)
 309 {
 310         return atomic_read(&net->ipv4.rt_genid);
 311 }
 312
 313 #ifdef CONFIG_PROC_FS
 314 struct rt_cache_iter_state {
 315         struct seq_net_private p;
 316         int bucket;
 317         int genid;
 318 };
 319
 320 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 321 {
 322         struct rt_cache_iter_state *st = seq->private;
 323         struct rtable *r = NULL;
 324
 325         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 326                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 327                         continue;
 328                 rcu_read_lock_bh();
 329                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 330                 while (r) {
 331                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 332                             r->rt_genid == st->genid)
 333                                 return r;
 334                         r = rcu_dereference_bh(r->dst.rt_next);
 335                 }
 336                 rcu_read_unlock_bh();
 337         }
 338         return r;
 339 }
 340
 341 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 342                                           struct rtable *r)
 343 {
 344         struct rt_cache_iter_state *st = seq->private;
 345
 346         r = rcu_dereference_bh(r->dst.rt_next);
 347         while (!r) {
 348                 rcu_read_unlock_bh();
 349                 do {
 350                         if (--st->bucket < 0)
 351                                 return NULL;
 352                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 353                 rcu_read_lock_bh();
 354                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 355         }
 356         return r;
 357 }
 358
 359 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 360                                         struct rtable *r)
 361 {
 362         struct rt_cache_iter_state *st = seq->private;
 363         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 364                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 365                         continue;
 366                 if (r->rt_genid == st->genid)
 367                         break;
 368         }
 369         return r;
 370 }
 371
 372 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 373 {
 374         struct rtable *r = rt_cache_get_first(seq);
 375
 376         if (r)
 377                 while (pos && (r = rt_cache_get_next(seq, r)))
 378                         --pos;
 379         return pos ? NULL : r;
 380 }
 381
 382 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 383 {
 384         struct rt_cache_iter_state *st = seq->private;
 385         if (*pos)
 386                 return rt_cache_get_idx(seq, *pos - 1);
 387         st->genid = rt_genid(seq_file_net(seq));
 388         return SEQ_START_TOKEN;
 389 }
 390
 391 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 392 {
 393         struct rtable *r;
 394
 395         if (v == SEQ_START_TOKEN)
 396                 r = rt_cache_get_first(seq);
 397         else
 398                 r = rt_cache_get_next(seq, v);
 399         ++*pos;
 400         return r;
 401 }
 402
 403 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 404 {
 405         if (v && v != SEQ_START_TOKEN)
 406                 rcu_read_unlock_bh();
 407 }
 408
 409 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 410 {
 411         if (v == SEQ_START_TOKEN)
 412                 seq_printf(seq, "%-127s\n",
 413                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 414                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 415                            "HHUptod\tSpecDst");
 416         else {
 417                 struct rtable *r = v;
 418                 struct neighbour *n;
 419                 int len;
 420
 421                 n = dst_get_neighbour(&r->dst);
 422                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 423                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 424                         r->dst.dev ? r->dst.dev->name : "*",
 425                         (__force u32)r->rt_dst,
 426                         (__force u32)r->rt_gateway,
 427                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 428                         r->dst.__use, 0, (__force u32)r->rt_src,
 429                         dst_metric_advmss(&r->dst) + 40,
 430                         dst_metric(&r->dst, RTAX_WINDOW),
 431                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 432                               dst_metric(&r->dst, RTAX_RTTVAR)),
 433                         r->rt_key_tos,
 434                         -1,
 435                         (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
 436                         r->rt_spec_dst, &len);
 437
 438                 seq_printf(seq, "%*s\n", 127 - len, "");
 439         }
 440         return 0;
 441 }
 442
 443 static const struct seq_operations rt_cache_seq_ops = {
 444         .start  = rt_cache_seq_start,
 445         .next   = rt_cache_seq_next,
 446         .stop   = rt_cache_seq_stop,
 447         .show   = rt_cache_seq_show,
 448 };
 449
 450 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 451 {
 452         return seq_open_net(inode, file, &rt_cache_seq_ops,
 453                         sizeof(struct rt_cache_iter_state));
 454 }
 455
 456 static const struct file_operations rt_cache_seq_fops = {
 457         .owner   = THIS_MODULE,
 458         .open    = rt_cache_seq_open,
 459         .read    = seq_read,
 460         .llseek  = seq_lseek,
 461         .release = seq_release_net,
 462 };
 463
 464
 465 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 466 {
 467         int cpu;
 468
 469         if (*pos == 0)
 470                 return SEQ_START_TOKEN;
 471
 472         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 473                 if (!cpu_possible(cpu))
 474                         continue;
 475                 *pos = cpu+1;
 476                 return &per_cpu(rt_cache_stat, cpu);
 477         }
 478         return NULL;
 479 }
 480
 481 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 482 {
 483         int cpu;
 484
 485         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 486                 if (!cpu_possible(cpu))
 487                         continue;
 488                 *pos = cpu+1;
 489                 return &per_cpu(rt_cache_stat, cpu);
 490         }
 491         return NULL;
 492
 493 }
 494
 495 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 496 {
 497
 498 }
 499
 500 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 501 {
 502         struct rt_cache_stat *st = v;
 503
 504         if (v == SEQ_START_TOKEN) {
 505                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 506                 return 0;
 507         }
 508
 509         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 510                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 511                    dst_entries_get_slow(&ipv4_dst_ops),
 512                    st->in_hit,
 513                    st->in_slow_tot,
 514                    st->in_slow_mc,
 515                    st->in_no_route,
 516                    st->in_brd,
 517                    st->in_martian_dst,
 518                    st->in_martian_src,
 519
 520                    st->out_hit,
 521                    st->out_slow_tot,
 522                    st->out_slow_mc,
 523
 524                    st->gc_total,
 525                    st->gc_ignored,
 526                    st->gc_goal_miss,
 527                    st->gc_dst_overflow,
 528                    st->in_hlist_search,
 529                    st->out_hlist_search
 530                 );
 531         return 0;
 532 }
 533
 534 static const struct seq_operations rt_cpu_seq_ops = {
 535         .start  = rt_cpu_seq_start,
 536         .next   = rt_cpu_seq_next,
 537         .stop   = rt_cpu_seq_stop,
 538         .show   = rt_cpu_seq_show,
 539 };
 540
 541
 542 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 543 {
 544         return seq_open(file, &rt_cpu_seq_ops);
 545 }
 546
 547 static const struct file_operations rt_cpu_seq_fops = {
 548         .owner   = THIS_MODULE,
 549         .open    = rt_cpu_seq_open,
 550         .read    = seq_read,
 551         .llseek  = seq_lseek,
 552         .release = seq_release,
 553 };
 554
 555 #ifdef CONFIG_IP_ROUTE_CLASSID
 556 static int rt_acct_proc_show(struct seq_file *m, void *v)
 557 {
 558         struct ip_rt_acct *dst, *src;
 559         unsigned int i, j;
 560
 561         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 562         if (!dst)
 563                 return -ENOMEM;
 564
 565         for_each_possible_cpu(i) {
 566                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 567                 for (j = 0; j < 256; j++) {
 568                         dst[j].o_bytes   += src[j].o_bytes;
 569                         dst[j].o_packets += src[j].o_packets;
 570                         dst[j].i_bytes   += src[j].i_bytes;
 571                         dst[j].i_packets += src[j].i_packets;
 572                 }
 573         }
 574
 575         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 576         kfree(dst);
 577         return 0;
 578 }
 579
 580 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 581 {
 582         return single_open(file, rt_acct_proc_show, NULL);
 583 }
 584
 585 static const struct file_operations rt_acct_proc_fops = {
 586         .owner          = THIS_MODULE,
 587         .open           = rt_acct_proc_open,
 588         .read           = seq_read,
 589         .llseek         = seq_lseek,
 590         .release        = single_release,
 591 };
 592 #endif
 593
 594 static int __net_init ip_rt_do_proc_init(struct net *net)
 595 {
 596         struct proc_dir_entry *pde;
 597
 598         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 599                         &rt_cache_seq_fops);
 600         if (!pde)
 601                 goto err1;
 602
 603         pde = proc_create("rt_cache", S_IRUGO,
 604                           net->proc_net_stat, &rt_cpu_seq_fops);
 605         if (!pde)
 606                 goto err2;
 607
 608 #ifdef CONFIG_IP_ROUTE_CLASSID
 609         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 610         if (!pde)
 611                 goto err3;
 612 #endif
 613         return 0;
 614
 615 #ifdef CONFIG_IP_ROUTE_CLASSID
 616 err3:
 617         remove_proc_entry("rt_cache", net->proc_net_stat);
 618 #endif
 619 err2:
 620         remove_proc_entry("rt_cache", net->proc_net);
 621 err1:
 622         return -ENOMEM;
 623 }
 624
 625 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 626 {
 627         remove_proc_entry("rt_cache", net->proc_net_stat);
 628         remove_proc_entry("rt_cache", net->proc_net);
 629 #ifdef CONFIG_IP_ROUTE_CLASSID
 630         remove_proc_entry("rt_acct", net->proc_net);
 631 #endif
 632 }
 633
 634 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 635         .init = ip_rt_do_proc_init,
 636         .exit = ip_rt_do_proc_exit,
 637 };
 638
 639 static int __init ip_rt_proc_init(void)
 640 {
 641         return register_pernet_subsys(&ip_rt_proc_ops);
 642 }
 643
 644 #else
 645 static inline int ip_rt_proc_init(void)
 646 {
 647         return 0;
 648 }
 649 #endif /* CONFIG_PROC_FS */
 650
 651 static inline void rt_free(struct rtable *rt)
 652 {
 653         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 654 }
 655
 656 static inline void rt_drop(struct rtable *rt)
 657 {
 658         ip_rt_put(rt);
 659         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 660 }
 661
 662 static inline int rt_fast_clean(struct rtable *rth)
 663 {
 664         /* Kill broadcast/multicast entries very aggresively, if they
 665            collide in hash table with more useful entries */
 666         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 667                 rt_is_input_route(rth) && rth->dst.rt_next;
 668 }
 669
 670 static inline int rt_valuable(struct rtable *rth)
 671 {
 672         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 673                 (rth->peer && rth->peer->pmtu_expires);
 674 }
 675
 676 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 677 {
 678         unsigned long age;
 679         int ret = 0;
 680
 681         if (atomic_read(&rth->dst.__refcnt))
 682                 goto out;
 683
 684         age = jiffies - rth->dst.lastuse;
 685         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 686             (age <= tmo2 && rt_valuable(rth)))
 687                 goto out;
 688         ret = 1;
 689 out:    return ret;
 690 }
 691
 692 /* Bits of score are:
 693  * 31: very valuable
 694  * 30: not quite useless
 695  * 29..0: usage counter
 696  */
 697 static inline u32 rt_score(struct rtable *rt)
 698 {
 699         u32 score = jiffies - rt->dst.lastuse;
 700
 701         score = ~score & ~(3<<30);
 702
 703         if (rt_valuable(rt))
 704                 score |= (1<<31);
 705
 706         if (rt_is_output_route(rt) ||
 707             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 708                 score |= (1<<30);
 709
 710         return score;
 711 }
 712
 713 static inline bool rt_caching(const struct net *net)
 714 {
 715         return net->ipv4.current_rt_cache_rebuild_count <=
 716                 net->ipv4.sysctl_rt_cache_rebuild_count;
 717 }
 718
 719 static inline bool compare_hash_inputs(const struct rtable *rt1,
 720                                        const struct rtable *rt2)
 721 {
 722         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 723                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 724                 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
 725 }
 726
 727 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 728 {
 729         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 731                 (rt1->rt_mark ^ rt2->rt_mark) |
 732                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 733                 (rt1->rt_oif ^ rt2->rt_oif) |
 734                 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
 735 }
 736
 737 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 738 {
 739         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 740 }
 741
 742 static inline int rt_is_expired(struct rtable *rth)
 743 {
 744         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 745 }
 746
 747 /*
 748  * Perform a full scan of hash table and free all entries.
 749  * Can be called by a softirq or a process.
 750  * In the later case, we want to be reschedule if necessary
 751  */
 752 static void rt_do_flush(struct net *net, int process_context)
 753 {
 754         unsigned int i;
 755         struct rtable *rth, *next;
 756
 757         for (i = 0; i <= rt_hash_mask; i++) {
 758                 struct rtable __rcu **pprev;
 759                 struct rtable *list;
 760
 761                 if (process_context && need_resched())
 762                         cond_resched();
 763                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
 764                 if (!rth)
 765                         continue;
 766
 767                 spin_lock_bh(rt_hash_lock_addr(i));
 768
 769                 list = NULL;
 770                 pprev = &rt_hash_table[i].chain;
 771                 rth = rcu_dereference_protected(*pprev,
 772                         lockdep_is_held(rt_hash_lock_addr(i)));
 773
 774                 while (rth) {
 775                         next = rcu_dereference_protected(rth->dst.rt_next,
 776                                 lockdep_is_held(rt_hash_lock_addr(i)));
 777
 778                         if (!net ||
 779                             net_eq(dev_net(rth->dst.dev), net)) {
 780                                 rcu_assign_pointer(*pprev, next);
 781                                 rcu_assign_pointer(rth->dst.rt_next, list);
 782                                 list = rth;
 783                         } else {
 784                                 pprev = &rth->dst.rt_next;
 785                         }
 786                         rth = next;
 787                 }
 788
 789                 spin_unlock_bh(rt_hash_lock_addr(i));
 790
 791                 for (; list; list = next) {
 792                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 793                         rt_free(list);
 794                 }
 795         }
 796 }
 797
 798 /*
 799  * While freeing expired entries, we compute average chain length
 800  * and standard deviation, using fixed-point arithmetic.
 801  * This to have an estimation of rt_chain_length_max
 802  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 803  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 804  */
 805
 806 #define FRACT_BITS 3
 807 #define ONE (1UL << FRACT_BITS)
 808
 809 /*
 810  * Given a hash chain and an item in this hash chain,
 811  * find if a previous entry has the same hash_inputs
 812  * (but differs on tos, mark or oif)
 813  * Returns 0 if an alias is found.
 814  * Returns ONE if rth has no alias before itself.
 815  */
 816 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 817 {
 818         const struct rtable *aux = head;
 819
 820         while (aux != rth) {
 821                 if (compare_hash_inputs(aux, rth))
 822                         return 0;
 823                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 824         }
 825         return ONE;
 826 }
 827
 828 /*
 829  * Perturbation of rt_genid by a small quantity [1..256]
 830  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 831  * many times (2^24) without giving recent rt_genid.
 832  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 833  */
 834 static void rt_cache_invalidate(struct net *net)
 835 {
 836         unsigned char shuffle;
 837
 838         get_random_bytes(&shuffle, sizeof(shuffle));
 839         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 840 }
 841
 842 /*
 843  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 844  * delay >= 0 : invalidate & flush cache (can be long)
 845  */
 846 void rt_cache_flush(struct net *net, int delay)
 847 {
 848         rt_cache_invalidate(net);
 849         if (delay >= 0)
 850                 rt_do_flush(net, !in_softirq());
 851 }
 852
 853 /* Flush previous cache invalidated entries from the cache */
 854 void rt_cache_flush_batch(struct net *net)
 855 {
 856         rt_do_flush(net, !in_softirq());
 857 }
 858
 859 static void rt_emergency_hash_rebuild(struct net *net)
 860 {
 861         if (net_ratelimit())
 862                 printk(KERN_WARNING "Route hash chain too long!\n");
 863         rt_cache_invalidate(net);
 864 }
 865
 866 /*
 867    Short description of GC goals.
 868
 869    We want to build algorithm, which will keep routing cache
 870    at some equilibrium point, when number of aged off entries
 871    is kept approximately equal to newly generated ones.
 872
 873    Current expiration strength is variable "expire".
 874    We try to adjust it dynamically, so that if networking
 875    is idle expires is large enough to keep enough of warm entries,
 876    and when load increases it reduces to limit cache size.
 877  */
 878
 879 static int rt_garbage_collect(struct dst_ops *ops)
 880 {
 881         static unsigned long expire = RT_GC_TIMEOUT;
 882         static unsigned long last_gc;
 883         static int rover;
 884         static int equilibrium;
 885         struct rtable *rth;
 886         struct rtable __rcu **rthp;
 887         unsigned long now = jiffies;
 888         int goal;
 889         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 890
 891         /*
 892          * Garbage collection is pretty expensive,
 893          * do not make it too frequently.
 894          */
 895
 896         RT_CACHE_STAT_INC(gc_total);
 897
 898         if (now - last_gc < ip_rt_gc_min_interval &&
 899             entries < ip_rt_max_size) {
 900                 RT_CACHE_STAT_INC(gc_ignored);
 901                 goto out;
 902         }
 903
 904         entries = dst_entries_get_slow(&ipv4_dst_ops);
 905         /* Calculate number of entries, which we want to expire now. */
 906         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 907         if (goal <= 0) {
 908                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 909                         equilibrium = ipv4_dst_ops.gc_thresh;
 910                 goal = entries - equilibrium;
 911                 if (goal > 0) {
 912                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 913                         goal = entries - equilibrium;
 914                 }
 915         } else {
 916                 /* We are in dangerous area. Try to reduce cache really
 917                  * aggressively.
 918                  */
 919                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 920                 equilibrium = entries - goal;
 921         }
 922
 923         if (now - last_gc >= ip_rt_gc_min_interval)
 924                 last_gc = now;
 925
 926         if (goal <= 0) {
 927                 equilibrium += goal;
 928                 goto work_done;
 929         }
 930
 931         do {
 932                 int i, k;
 933
 934                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 935                         unsigned long tmo = expire;
 936
 937                         k = (k + 1) & rt_hash_mask;
 938                         rthp = &rt_hash_table[k].chain;
 939                         spin_lock_bh(rt_hash_lock_addr(k));
 940                         while ((rth = rcu_dereference_protected(*rthp,
 941                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 942                                 if (!rt_is_expired(rth) &&
 943                                         !rt_may_expire(rth, tmo, expire)) {
 944                                         tmo >>= 1;
 945                                         rthp = &rth->dst.rt_next;
 946                                         continue;
 947                                 }
 948                                 *rthp = rth->dst.rt_next;
 949                                 rt_free(rth);
 950                                 goal--;
 951                         }
 952                         spin_unlock_bh(rt_hash_lock_addr(k));
 953                         if (goal <= 0)
 954                                 break;
 955                 }
 956                 rover = k;
 957
 958                 if (goal <= 0)
 959                         goto work_done;
 960
 961                 /* Goal is not achieved. We stop process if:
 962
 963                    - if expire reduced to zero. Otherwise, expire is halfed.
 964                    - if table is not full.
 965                    - if we are called from interrupt.
 966                    - jiffies check is just fallback/debug loop breaker.
 967                      We will not spin here for long time in any case.
 968                  */
 969
 970                 RT_CACHE_STAT_INC(gc_goal_miss);
 971
 972                 if (expire == 0)
 973                         break;
 974
 975                 expire >>= 1;
 976
 977                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 978                         goto out;
 979         } while (!in_softirq() && time_before_eq(jiffies, now));
 980
 981         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 982                 goto out;
 983         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 984                 goto out;
 985         if (net_ratelimit())
 986                 printk(KERN_WARNING "dst cache overflow\n");
 987         RT_CACHE_STAT_INC(gc_dst_overflow);
 988         return 1;
 989
 990 work_done:
 991         expire += ip_rt_gc_min_interval;
 992         if (expire > ip_rt_gc_timeout ||
 993             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
 994             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 995                 expire = ip_rt_gc_timeout;
 996 out:    return 0;
 997 }
 998
 999 /*
1000  * Returns number of entries in a hash chain that have different hash_inputs
1001  */
1002 static int slow_chain_length(const struct rtable *head)
1003 {
1004         int length = 0;
1005         const struct rtable *rth = head;
1006
1007         while (rth) {
1008                 length += has_noalias(head, rth);
1009                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1010         }
1011         return length >> FRACT_BITS;
1012 }
1013
1014 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1015 {
1016         struct neigh_table *tbl = &arp_tbl;
1017         static const __be32 inaddr_any = 0;
1018         struct net_device *dev = dst->dev;
1019         const __be32 *pkey = daddr;
1020         struct neighbour *n;
1021
1022 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1023         if (dev->type == ARPHRD_ATM)
1024                 tbl = clip_tbl_hook;
1025 #endif
1026         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1027                 pkey = &inaddr_any;
1028
1029         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1030         if (n)
1031                 return n;
1032         return neigh_create(tbl, pkey, dev);
1033 }
1034
1035 static int rt_bind_neighbour(struct rtable *rt)
1036 {
1037         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1038         if (IS_ERR(n))
1039                 return PTR_ERR(n);
1040         dst_set_neighbour(&rt->dst, n);
1041
1042         return 0;
1043 }
1044
1045 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1046                                      struct sk_buff *skb, int ifindex)
1047 {
1048         struct rtable   *rth, *cand;
1049         struct rtable __rcu **rthp, **candp;
1050         unsigned long   now;
1051         u32             min_score;
1052         int             chain_length;
1053         int attempts = !in_softirq();
1054
1055 restart:
1056         chain_length = 0;
1057         min_score = ~(u32)0;
1058         cand = NULL;
1059         candp = NULL;
1060         now = jiffies;
1061
1062         if (!rt_caching(dev_net(rt->dst.dev))) {
1063                 /*
1064                  * If we're not caching, just tell the caller we
1065                  * were successful and don't touch the route.  The
1066                  * caller hold the sole reference to the cache entry, and
1067                  * it will be released when the caller is done with it.
1068                  * If we drop it here, the callers have no way to resolve routes
1069                  * when we're not caching.  Instead, just point *rp at rt, so
1070                  * the caller gets a single use out of the route
1071                  * Note that we do rt_free on this new route entry, so that
1072                  * once its refcount hits zero, we are still able to reap it
1073                  * (Thanks Alexey)
1074                  * Note: To avoid expensive rcu stuff for this uncached dst,
1075                  * we set DST_NOCACHE so that dst_release() can free dst without
1076                  * waiting a grace period.
1077                  */
1078
1079                 rt->dst.flags |= DST_NOCACHE;
1080                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1081                         int err = rt_bind_neighbour(rt);
1082                         if (err) {
1083                                 if (net_ratelimit())
1084                                         printk(KERN_WARNING
1085                                             "Neighbour table failure & not caching routes.\n");
1086                                 ip_rt_put(rt);
1087                                 return ERR_PTR(err);
1088                         }
1089                 }
1090
1091                 goto skip_hashing;
1092         }
1093
1094         rthp = &rt_hash_table[hash].chain;
1095
1096         spin_lock_bh(rt_hash_lock_addr(hash));
1097         while ((rth = rcu_dereference_protected(*rthp,
1098                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1099                 if (rt_is_expired(rth)) {
1100                         *rthp = rth->dst.rt_next;
1101                         rt_free(rth);
1102                         continue;
1103                 }
1104                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1105                         /* Put it first */
1106                         *rthp = rth->dst.rt_next;
1107                         /*
1108                          * Since lookup is lockfree, the deletion
1109                          * must be visible to another weakly ordered CPU before
1110                          * the insertion at the start of the hash chain.
1111                          */
1112                         rcu_assign_pointer(rth->dst.rt_next,
1113                                            rt_hash_table[hash].chain);
1114                         /*
1115                          * Since lookup is lockfree, the update writes
1116                          * must be ordered for consistency on SMP.
1117                          */
1118                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1119
1120                         dst_use(&rth->dst, now);
1121                         spin_unlock_bh(rt_hash_lock_addr(hash));
1122
1123                         rt_drop(rt);
1124                         if (skb)
1125                                 skb_dst_set(skb, &rth->dst);
1126                         return rth;
1127                 }
1128
1129                 if (!atomic_read(&rth->dst.__refcnt)) {
1130                         u32 score = rt_score(rth);
1131
1132                         if (score <= min_score) {
1133                                 cand = rth;
1134                                 candp = rthp;
1135                                 min_score = score;
1136                         }
1137                 }
1138
1139                 chain_length++;
1140
1141                 rthp = &rth->dst.rt_next;
1142         }
1143
1144         if (cand) {
1145                 /* ip_rt_gc_elasticity used to be average length of chain
1146                  * length, when exceeded gc becomes really aggressive.
1147                  *
1148                  * The second limit is less certain. At the moment it allows
1149                  * only 2 entries per bucket. We will see.
1150                  */
1151                 if (chain_length > ip_rt_gc_elasticity) {
1152                         *candp = cand->dst.rt_next;
1153                         rt_free(cand);
1154                 }
1155         } else {
1156                 if (chain_length > rt_chain_length_max &&
1157                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1158                         struct net *net = dev_net(rt->dst.dev);
1159                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1160                         if (!rt_caching(net)) {
1161                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1162                                         rt->dst.dev->name, num);
1163                         }
1164                         rt_emergency_hash_rebuild(net);
1165                         spin_unlock_bh(rt_hash_lock_addr(hash));
1166
1167                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1168                                         ifindex, rt_genid(net));
1169                         goto restart;
1170                 }
1171         }
1172
1173         /* Try to bind route to arp only if it is output
1174            route or unicast forwarding path.
1175          */
1176         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1177                 int err = rt_bind_neighbour(rt);
1178                 if (err) {
1179                         spin_unlock_bh(rt_hash_lock_addr(hash));
1180
1181                         if (err != -ENOBUFS) {
1182                                 rt_drop(rt);
1183                                 return ERR_PTR(err);
1184                         }
1185
1186                         /* Neighbour tables are full and nothing
1187                            can be released. Try to shrink route cache,
1188                            it is most likely it holds some neighbour records.
1189                          */
1190                         if (attempts-- > 0) {
1191                                 int saved_elasticity = ip_rt_gc_elasticity;
1192                                 int saved_int = ip_rt_gc_min_interval;
1193                                 ip_rt_gc_elasticity     = 1;
1194                                 ip_rt_gc_min_interval   = 0;
1195                                 rt_garbage_collect(&ipv4_dst_ops);
1196                                 ip_rt_gc_min_interval   = saved_int;
1197                                 ip_rt_gc_elasticity     = saved_elasticity;
1198                                 goto restart;
1199                         }
1200
1201                         if (net_ratelimit())
1202                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1203                         rt_drop(rt);
1204                         return ERR_PTR(-ENOBUFS);
1205                 }
1206         }
1207
1208         rt->dst.rt_next = rt_hash_table[hash].chain;
1209
1210         /*
1211          * Since lookup is lockfree, we must make sure
1212          * previous writes to rt are committed to memory
1213          * before making rt visible to other CPUS.
1214          */
1215         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1216
1217         spin_unlock_bh(rt_hash_lock_addr(hash));
1218
1219 skip_hashing:
1220         if (skb)
1221                 skb_dst_set(skb, &rt->dst);
1222         return rt;
1223 }
1224
1225 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1226
1227 static u32 rt_peer_genid(void)
1228 {
1229         return atomic_read(&__rt_peer_genid);
1230 }
1231
1232 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1233 {
1234         struct inet_peer *peer;
1235
1236         peer = inet_getpeer_v4(daddr, create);
1237
1238         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1239                 inet_putpeer(peer);
1240         else
1241                 rt->rt_peer_genid = rt_peer_genid();
1242 }
1243
1244 /*
1245  * Peer allocation may fail only in serious out-of-memory conditions.  However
1246  * we still can generate some output.
1247  * Random ID selection looks a bit dangerous because we have no chances to
1248  * select ID being unique in a reasonable period of time.
1249  * But broken packet identifier may be better than no packet at all.
1250  */
1251 static void ip_select_fb_ident(struct iphdr *iph)
1252 {
1253         static DEFINE_SPINLOCK(ip_fb_id_lock);
1254         static u32 ip_fallback_id;
1255         u32 salt;
1256
1257         spin_lock_bh(&ip_fb_id_lock);
1258         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1259         iph->id = htons(salt & 0xFFFF);
1260         ip_fallback_id = salt;
1261         spin_unlock_bh(&ip_fb_id_lock);
1262 }
1263
1264 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1265 {
1266         struct rtable *rt = (struct rtable *) dst;
1267
1268         if (rt) {
1269                 if (rt->peer == NULL)
1270                         rt_bind_peer(rt, rt->rt_dst, 1);
1271
1272                 /* If peer is attached to destination, it is never detached,
1273                    so that we need not to grab a lock to dereference it.
1274                  */
1275                 if (rt->peer) {
1276                         iph->id = htons(inet_getid(rt->peer, more));
1277                         return;
1278                 }
1279         } else
1280                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1281                        __builtin_return_address(0));
1282
1283         ip_select_fb_ident(iph);
1284 }
1285 EXPORT_SYMBOL(__ip_select_ident);
1286
1287 static void rt_del(unsigned hash, struct rtable *rt)
1288 {
1289         struct rtable __rcu **rthp;
1290         struct rtable *aux;
1291
1292         rthp = &rt_hash_table[hash].chain;
1293         spin_lock_bh(rt_hash_lock_addr(hash));
1294         ip_rt_put(rt);
1295         while ((aux = rcu_dereference_protected(*rthp,
1296                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1297                 if (aux == rt || rt_is_expired(aux)) {
1298                         *rthp = aux->dst.rt_next;
1299                         rt_free(aux);
1300                         continue;
1301                 }
1302                 rthp = &aux->dst.rt_next;
1303         }
1304         spin_unlock_bh(rt_hash_lock_addr(hash));
1305 }
1306
1307 /* called in rcu_read_lock() section */
1308 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1309                     __be32 saddr, struct net_device *dev)
1310 {
1311         struct in_device *in_dev = __in_dev_get_rcu(dev);
1312         struct inet_peer *peer;
1313         struct net *net;
1314
1315         if (!in_dev)
1316                 return;
1317
1318         net = dev_net(dev);
1319         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1320             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1321             ipv4_is_zeronet(new_gw))
1322                 goto reject_redirect;
1323
1324         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1325                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1326                         goto reject_redirect;
1327                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1328                         goto reject_redirect;
1329         } else {
1330                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1331                         goto reject_redirect;
1332         }
1333
1334         peer = inet_getpeer_v4(daddr, 1);
1335         if (peer) {
1336                 peer->redirect_learned.a4 = new_gw;
1337
1338                 inet_putpeer(peer);
1339
1340                 atomic_inc(&__rt_peer_genid);
1341         }
1342         return;
1343
1344 reject_redirect:
1345 #ifdef CONFIG_IP_ROUTE_VERBOSE
1346         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1347                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1348                         "  Advised path = %pI4 -> %pI4\n",
1349                        &old_gw, dev->name, &new_gw,
1350                        &saddr, &daddr);
1351 #endif
1352         ;
1353 }
1354
1355 static bool peer_pmtu_expired(struct inet_peer *peer)
1356 {
1357         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1358
1359         return orig &&
1360                time_after_eq(jiffies, orig) &&
1361                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1362 }
1363
1364 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1365 {
1366         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1367
1368         return orig &&
1369                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1370 }
1371
1372 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1373 {
1374         struct rtable *rt = (struct rtable *)dst;
1375         struct dst_entry *ret = dst;
1376
1377         if (rt) {
1378                 if (dst->obsolete > 0) {
1379                         ip_rt_put(rt);
1380                         ret = NULL;
1381                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1382                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1383                                                 rt->rt_oif,
1384                                                 rt_genid(dev_net(dst->dev)));
1385                         rt_del(hash, rt);
1386                         ret = NULL;
1387                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1388                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1389                 }
1390         }
1391         return ret;
1392 }
1393
1394 /*
1395  * Algorithm:
1396  *      1. The first ip_rt_redirect_number redirects are sent
1397  *         with exponential backoff, then we stop sending them at all,
1398  *         assuming that the host ignores our redirects.
1399  *      2. If we did not see packets requiring redirects
1400  *         during ip_rt_redirect_silence, we assume that the host
1401  *         forgot redirected route and start to send redirects again.
1402  *
1403  * This algorithm is much cheaper and more intelligent than dumb load limiting
1404  * in icmp.c.
1405  *
1406  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1407  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1408  */
1409
1410 void ip_rt_send_redirect(struct sk_buff *skb)
1411 {
1412         struct rtable *rt = skb_rtable(skb);
1413         struct in_device *in_dev;
1414         struct inet_peer *peer;
1415         int log_martians;
1416
1417         rcu_read_lock();
1418         in_dev = __in_dev_get_rcu(rt->dst.dev);
1419         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1420                 rcu_read_unlock();
1421                 return;
1422         }
1423         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1424         rcu_read_unlock();
1425
1426         if (!rt->peer)
1427                 rt_bind_peer(rt, rt->rt_dst, 1);
1428         peer = rt->peer;
1429         if (!peer) {
1430                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1431                 return;
1432         }
1433
1434         /* No redirected packets during ip_rt_redirect_silence;
1435          * reset the algorithm.
1436          */
1437         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1438                 peer->rate_tokens = 0;
1439
1440         /* Too many ignored redirects; do not send anything
1441          * set dst.rate_last to the last seen redirected packet.
1442          */
1443         if (peer->rate_tokens >= ip_rt_redirect_number) {
1444                 peer->rate_last = jiffies;
1445                 return;
1446         }
1447
1448         /* Check for load limit; set rate_last to the latest sent
1449          * redirect.
1450          */
1451         if (peer->rate_tokens == 0 ||
1452             time_after(jiffies,
1453                        (peer->rate_last +
1454                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1455                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1456                 peer->rate_last = jiffies;
1457                 ++peer->rate_tokens;
1458 #ifdef CONFIG_IP_ROUTE_VERBOSE
1459                 if (log_martians &&
1460                     peer->rate_tokens == ip_rt_redirect_number &&
1461                     net_ratelimit())
1462                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1463                                &ip_hdr(skb)->saddr, rt->rt_iif,
1464                                 &rt->rt_dst, &rt->rt_gateway);
1465 #endif
1466         }
1467 }
1468
1469 static int ip_error(struct sk_buff *skb)
1470 {
1471         struct rtable *rt = skb_rtable(skb);
1472         struct inet_peer *peer;
1473         unsigned long now;
1474         bool send;
1475         int code;
1476
1477         switch (rt->dst.error) {
1478         case EINVAL:
1479         default:
1480                 goto out;
1481         case EHOSTUNREACH:
1482                 code = ICMP_HOST_UNREACH;
1483                 break;
1484         case ENETUNREACH:
1485                 code = ICMP_NET_UNREACH;
1486                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1487                                 IPSTATS_MIB_INNOROUTES);
1488                 break;
1489         case EACCES:
1490                 code = ICMP_PKT_FILTERED;
1491                 break;
1492         }
1493
1494         if (!rt->peer)
1495                 rt_bind_peer(rt, rt->rt_dst, 1);
1496         peer = rt->peer;
1497
1498         send = true;
1499         if (peer) {
1500                 now = jiffies;
1501                 peer->rate_tokens += now - peer->rate_last;
1502                 if (peer->rate_tokens > ip_rt_error_burst)
1503                         peer->rate_tokens = ip_rt_error_burst;
1504                 peer->rate_last = now;
1505                 if (peer->rate_tokens >= ip_rt_error_cost)
1506                         peer->rate_tokens -= ip_rt_error_cost;
1507                 else
1508                         send = false;
1509         }
1510         if (send)
1511                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1512
1513 out:    kfree_skb(skb);
1514         return 0;
1515 }
1516
1517 /*
1518  *      The last two values are not from the RFC but
1519  *      are needed for AMPRnet AX.25 paths.
1520  */
1521
1522 static const unsigned short mtu_plateau[] =
1523 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1524
1525 static inline unsigned short guess_mtu(unsigned short old_mtu)
1526 {
1527         int i;
1528
1529         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1530                 if (old_mtu > mtu_plateau[i])
1531                         return mtu_plateau[i];
1532         return 68;
1533 }
1534
1535 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1536                                  unsigned short new_mtu,
1537                                  struct net_device *dev)
1538 {
1539         unsigned short old_mtu = ntohs(iph->tot_len);
1540         unsigned short est_mtu = 0;
1541         struct inet_peer *peer;
1542
1543         peer = inet_getpeer_v4(iph->daddr, 1);
1544         if (peer) {
1545                 unsigned short mtu = new_mtu;
1546
1547                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1548                         /* BSD 4.2 derived systems incorrectly adjust
1549                          * tot_len by the IP header length, and report
1550                          * a zero MTU in the ICMP message.
1551                          */
1552                         if (mtu == 0 &&
1553                             old_mtu >= 68 + (iph->ihl << 2))
1554                                 old_mtu -= iph->ihl << 2;
1555                         mtu = guess_mtu(old_mtu);
1556                 }
1557
1558                 if (mtu < ip_rt_min_pmtu)
1559                         mtu = ip_rt_min_pmtu;
1560                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1561                         unsigned long pmtu_expires;
1562
1563                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1564                         if (!pmtu_expires)
1565                                 pmtu_expires = 1UL;
1566
1567                         est_mtu = mtu;
1568                         peer->pmtu_learned = mtu;
1569                         peer->pmtu_expires = pmtu_expires;
1570                 }
1571
1572                 inet_putpeer(peer);
1573
1574                 atomic_inc(&__rt_peer_genid);
1575         }
1576         return est_mtu ? : new_mtu;
1577 }
1578
1579 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1580 {
1581         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1582
1583         if (!expires)
1584                 return;
1585         if (time_before(jiffies, expires)) {
1586                 u32 orig_dst_mtu = dst_mtu(dst);
1587                 if (peer->pmtu_learned < orig_dst_mtu) {
1588                         if (!peer->pmtu_orig)
1589                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1590                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1591                 }
1592         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1593                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1594 }
1595
1596 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1597 {
1598         struct rtable *rt = (struct rtable *) dst;
1599         struct inet_peer *peer;
1600
1601         dst_confirm(dst);
1602
1603         if (!rt->peer)
1604                 rt_bind_peer(rt, rt->rt_dst, 1);
1605         peer = rt->peer;
1606         if (peer) {
1607                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1608
1609                 if (mtu < ip_rt_min_pmtu)
1610                         mtu = ip_rt_min_pmtu;
1611                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1612
1613                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1614                         if (!pmtu_expires)
1615                                 pmtu_expires = 1UL;
1616
1617                         peer->pmtu_learned = mtu;
1618                         peer->pmtu_expires = pmtu_expires;
1619
1620                         atomic_inc(&__rt_peer_genid);
1621                         rt->rt_peer_genid = rt_peer_genid();
1622                 }
1623                 check_peer_pmtu(dst, peer);
1624         }
1625 }
1626
1627 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1628 {
1629         struct rtable *rt = (struct rtable *) dst;
1630         __be32 orig_gw = rt->rt_gateway;
1631         struct neighbour *n, *old_n;
1632
1633         dst_confirm(&rt->dst);
1634
1635         rt->rt_gateway = peer->redirect_learned.a4;
1636
1637         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1638         if (IS_ERR(n))
1639                 return PTR_ERR(n);
1640         old_n = xchg(&rt->dst._neighbour, n);
1641         if (old_n)
1642                 neigh_release(old_n);
1643         if (!n || !(n->nud_state & NUD_VALID)) {
1644                 if (n)
1645                         neigh_event_send(n, NULL);
1646                 rt->rt_gateway = orig_gw;
1647                 return -EAGAIN;
1648         } else {
1649                 rt->rt_flags |= RTCF_REDIRECTED;
1650                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1651         }
1652         return 0;
1653 }
1654
1655 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1656 {
1657         struct rtable *rt = (struct rtable *) dst;
1658
1659         if (rt_is_expired(rt))
1660                 return NULL;
1661         if (rt->rt_peer_genid != rt_peer_genid()) {
1662                 struct inet_peer *peer;
1663
1664                 if (!rt->peer)
1665                         rt_bind_peer(rt, rt->rt_dst, 0);
1666
1667                 peer = rt->peer;
1668                 if (peer) {
1669                         check_peer_pmtu(dst, peer);
1670
1671                         if (peer->redirect_learned.a4 &&
1672                             peer->redirect_learned.a4 != rt->rt_gateway) {
1673                                 if (check_peer_redir(dst, peer))
1674                                         return NULL;
1675                         }
1676                 }
1677
1678                 rt->rt_peer_genid = rt_peer_genid();
1679         }
1680         return dst;
1681 }
1682
1683 static void ipv4_dst_destroy(struct dst_entry *dst)
1684 {
1685         struct rtable *rt = (struct rtable *) dst;
1686         struct inet_peer *peer = rt->peer;
1687
1688         if (rt->fi) {
1689                 fib_info_put(rt->fi);
1690                 rt->fi = NULL;
1691         }
1692         if (peer) {
1693                 rt->peer = NULL;
1694                 inet_putpeer(peer);
1695         }
1696 }
1697
1698
1699 static void ipv4_link_failure(struct sk_buff *skb)
1700 {
1701         struct rtable *rt;
1702
1703         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1704
1705         rt = skb_rtable(skb);
1706         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1707                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1708 }
1709
1710 static int ip_rt_bug(struct sk_buff *skb)
1711 {
1712         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1713                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1714                 skb->dev ? skb->dev->name : "?");
1715         kfree_skb(skb);
1716         WARN_ON(1);
1717         return 0;
1718 }
1719
1720 /*
1721    We do not cache source address of outgoing interface,
1722    because it is used only by IP RR, TS and SRR options,
1723    so that it out of fast path.
1724
1725    BTW remember: "addr" is allowed to be not aligned
1726    in IP options!
1727  */
1728
1729 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1730 {
1731         __be32 src;
1732
1733         if (rt_is_output_route(rt))
1734                 src = ip_hdr(skb)->saddr;
1735         else {
1736                 struct fib_result res;
1737                 struct flowi4 fl4;
1738                 struct iphdr *iph;
1739
1740                 iph = ip_hdr(skb);
1741
1742                 memset(&fl4, 0, sizeof(fl4));
1743                 fl4.daddr = iph->daddr;
1744                 fl4.saddr = iph->saddr;
1745                 fl4.flowi4_tos = RT_TOS(iph->tos);
1746                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1747                 fl4.flowi4_iif = skb->dev->ifindex;
1748                 fl4.flowi4_mark = skb->mark;
1749
1750                 rcu_read_lock();
1751                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1752                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1753                 else
1754                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1755                                         RT_SCOPE_UNIVERSE);
1756                 rcu_read_unlock();
1757         }
1758         memcpy(addr, &src, 4);
1759 }
1760
1761 #ifdef CONFIG_IP_ROUTE_CLASSID
1762 static void set_class_tag(struct rtable *rt, u32 tag)
1763 {
1764         if (!(rt->dst.tclassid & 0xFFFF))
1765                 rt->dst.tclassid |= tag & 0xFFFF;
1766         if (!(rt->dst.tclassid & 0xFFFF0000))
1767                 rt->dst.tclassid |= tag & 0xFFFF0000;
1768 }
1769 #endif
1770
1771 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1772 {
1773         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1774
1775         if (advmss == 0) {
1776                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1777                                ip_rt_min_advmss);
1778                 if (advmss > 65535 - 40)
1779                         advmss = 65535 - 40;
1780         }
1781         return advmss;
1782 }
1783
1784 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1785 {
1786         unsigned int mtu = dst->dev->mtu;
1787
1788         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1789                 const struct rtable *rt = (const struct rtable *) dst;
1790
1791                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1792                         mtu = 576;
1793         }
1794
1795         if (mtu > IP_MAX_MTU)
1796                 mtu = IP_MAX_MTU;
1797
1798         return mtu;
1799 }
1800
1801 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1802                             struct fib_info *fi)
1803 {
1804         struct inet_peer *peer;
1805         int create = 0;
1806
1807         /* If a peer entry exists for this destination, we must hook
1808          * it up in order to get at cached metrics.
1809          */
1810         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1811                 create = 1;
1812
1813         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1814         if (peer) {
1815                 rt->rt_peer_genid = rt_peer_genid();
1816                 if (inet_metrics_new(peer))
1817                         memcpy(peer->metrics, fi->fib_metrics,
1818                                sizeof(u32) * RTAX_MAX);
1819                 dst_init_metrics(&rt->dst, peer->metrics, false);
1820
1821                 check_peer_pmtu(&rt->dst, peer);
1822                 if (peer->redirect_learned.a4 &&
1823                     peer->redirect_learned.a4 != rt->rt_gateway) {
1824                         rt->rt_gateway = peer->redirect_learned.a4;
1825                         rt->rt_flags |= RTCF_REDIRECTED;
1826                 }
1827         } else {
1828                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1829                         rt->fi = fi;
1830                         atomic_inc(&fi->fib_clntref);
1831                 }
1832                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1833         }
1834 }
1835
1836 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1837                            const struct fib_result *res,
1838                            struct fib_info *fi, u16 type, u32 itag)
1839 {
1840         struct dst_entry *dst = &rt->dst;
1841
1842         if (fi) {
1843                 if (FIB_RES_GW(*res) &&
1844                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1845                         rt->rt_gateway = FIB_RES_GW(*res);
1846                 rt_init_metrics(rt, fl4, fi);
1847 #ifdef CONFIG_IP_ROUTE_CLASSID
1848                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1849 #endif
1850         }
1851
1852         if (dst_mtu(dst) > IP_MAX_MTU)
1853                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1854         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1855                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1856
1857 #ifdef CONFIG_IP_ROUTE_CLASSID
1858 #ifdef CONFIG_IP_MULTIPLE_TABLES
1859         set_class_tag(rt, fib_rules_tclass(res));
1860 #endif
1861         set_class_tag(rt, itag);
1862 #endif
1863 }
1864
1865 static struct rtable *rt_dst_alloc(struct net_device *dev,
1866                                    bool nopolicy, bool noxfrm)
1867 {
1868         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1869                          DST_HOST |
1870                          (nopolicy ? DST_NOPOLICY : 0) |
1871                          (noxfrm ? DST_NOXFRM : 0));
1872 }
1873
1874 /* called in rcu_read_lock() section */
1875 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1876                                 u8 tos, struct net_device *dev, int our)
1877 {
1878         unsigned int hash;
1879         struct rtable *rth;
1880         __be32 spec_dst;
1881         struct in_device *in_dev = __in_dev_get_rcu(dev);
1882         u32 itag = 0;
1883         int err;
1884
1885         /* Primary sanity checks. */
1886
1887         if (in_dev == NULL)
1888                 return -EINVAL;
1889
1890         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1891             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1892                 goto e_inval;
1893
1894         if (ipv4_is_zeronet(saddr)) {
1895                 if (!ipv4_is_local_multicast(daddr))
1896                         goto e_inval;
1897                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1898         } else {
1899                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1900                                           &itag);
1901                 if (err < 0)
1902                         goto e_err;
1903         }
1904         rth = rt_dst_alloc(init_net.loopback_dev,
1905                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1906         if (!rth)
1907                 goto e_nobufs;
1908
1909 #ifdef CONFIG_IP_ROUTE_CLASSID
1910         rth->dst.tclassid = itag;
1911 #endif
1912         rth->dst.output = ip_rt_bug;
1913
1914         rth->rt_key_dst = daddr;
1915         rth->rt_key_src = saddr;
1916         rth->rt_genid   = rt_genid(dev_net(dev));
1917         rth->rt_flags   = RTCF_MULTICAST;
1918         rth->rt_type    = RTN_MULTICAST;
1919         rth->rt_key_tos = tos;
1920         rth->rt_dst     = daddr;
1921         rth->rt_src     = saddr;
1922         rth->rt_route_iif = dev->ifindex;
1923         rth->rt_iif     = dev->ifindex;
1924         rth->rt_oif     = 0;
1925         rth->rt_mark    = skb->mark;
1926         rth->rt_gateway = daddr;
1927         rth->rt_spec_dst= spec_dst;
1928         rth->rt_peer_genid = 0;
1929         rth->peer = NULL;
1930         rth->fi = NULL;
1931         if (our) {
1932                 rth->dst.input= ip_local_deliver;
1933                 rth->rt_flags |= RTCF_LOCAL;
1934         }
1935
1936 #ifdef CONFIG_IP_MROUTE
1937         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1938                 rth->dst.input = ip_mr_input;
1939 #endif
1940         RT_CACHE_STAT_INC(in_slow_mc);
1941
1942         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1943         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1944         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1945
1946 e_nobufs:
1947         return -ENOBUFS;
1948 e_inval:
1949         return -EINVAL;
1950 e_err:
1951         return err;
1952 }
1953
1954
1955 static void ip_handle_martian_source(struct net_device *dev,
1956                                      struct in_device *in_dev,
1957                                      struct sk_buff *skb,
1958                                      __be32 daddr,
1959                                      __be32 saddr)
1960 {
1961         RT_CACHE_STAT_INC(in_martian_src);
1962 #ifdef CONFIG_IP_ROUTE_VERBOSE
1963         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1964                 /*
1965                  *      RFC1812 recommendation, if source is martian,
1966                  *      the only hint is MAC header.
1967                  */
1968                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1969                         &daddr, &saddr, dev->name);
1970                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1971                         int i;
1972                         const unsigned char *p = skb_mac_header(skb);
1973                         printk(KERN_WARNING "ll header: ");
1974                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1975                                 printk("%02x", *p);
1976                                 if (i < (dev->hard_header_len - 1))
1977                                         printk(":");
1978                         }
1979                         printk("\n");
1980                 }
1981         }
1982 #endif
1983 }
1984
1985 /* called in rcu_read_lock() section */
1986 static int __mkroute_input(struct sk_buff *skb,
1987                            const struct fib_result *res,
1988                            struct in_device *in_dev,
1989                            __be32 daddr, __be32 saddr, u32 tos,
1990                            struct rtable **result)
1991 {
1992         struct rtable *rth;
1993         int err;
1994         struct in_device *out_dev;
1995         unsigned int flags = 0;
1996         __be32 spec_dst;
1997         u32 itag;
1998
1999         /* get a working reference to the output device */
2000         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2001         if (out_dev == NULL) {
2002                 if (net_ratelimit())
2003                         printk(KERN_CRIT "Bug in ip_route_input" \
2004                                "_slow(). Please, report\n");
2005                 return -EINVAL;
2006         }
2007
2008
2009         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2010                                   in_dev->dev, &spec_dst, &itag);
2011         if (err < 0) {
2012                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2013                                          saddr);
2014
2015                 goto cleanup;
2016         }
2017
2018         if (err)
2019                 flags |= RTCF_DIRECTSRC;
2020
2021         if (out_dev == in_dev && err &&
2022             (IN_DEV_SHARED_MEDIA(out_dev) ||
2023              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2024                 flags |= RTCF_DOREDIRECT;
2025
2026         if (skb->protocol != htons(ETH_P_IP)) {
2027                 /* Not IP (i.e. ARP). Do not create route, if it is
2028                  * invalid for proxy arp. DNAT routes are always valid.
2029                  *
2030                  * Proxy arp feature have been extended to allow, ARP
2031                  * replies back to the same interface, to support
2032                  * Private VLAN switch technologies. See arp.c.
2033                  */
2034                 if (out_dev == in_dev &&
2035                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2036                         err = -EINVAL;
2037                         goto cleanup;
2038                 }
2039         }
2040
2041         rth = rt_dst_alloc(out_dev->dev,
2042                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2043                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2044         if (!rth) {
2045                 err = -ENOBUFS;
2046                 goto cleanup;
2047         }
2048
2049         rth->rt_key_dst = daddr;
2050         rth->rt_key_src = saddr;
2051         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2052         rth->rt_flags = flags;
2053         rth->rt_type = res->type;
2054         rth->rt_key_tos = tos;
2055         rth->rt_dst     = daddr;
2056         rth->rt_src     = saddr;
2057         rth->rt_route_iif = in_dev->dev->ifindex;
2058         rth->rt_iif     = in_dev->dev->ifindex;
2059         rth->rt_oif     = 0;
2060         rth->rt_mark    = skb->mark;
2061         rth->rt_gateway = daddr;
2062         rth->rt_spec_dst= spec_dst;
2063         rth->rt_peer_genid = 0;
2064         rth->peer = NULL;
2065         rth->fi = NULL;
2066
2067         rth->dst.input = ip_forward;
2068         rth->dst.output = ip_output;
2069
2070         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2071
2072         *result = rth;
2073         err = 0;
2074  cleanup:
2075         return err;
2076 }
2077
2078 static int ip_mkroute_input(struct sk_buff *skb,
2079                             struct fib_result *res,
2080                             const struct flowi4 *fl4,
2081                             struct in_device *in_dev,
2082                             __be32 daddr, __be32 saddr, u32 tos)
2083 {
2084         struct rtable* rth = NULL;
2085         int err;
2086         unsigned hash;
2087
2088 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2089         if (res->fi && res->fi->fib_nhs > 1)
2090                 fib_select_multipath(res);
2091 #endif
2092
2093         /* create a routing cache entry */
2094         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2095         if (err)
2096                 return err;
2097
2098         /* put it into the cache */
2099         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2100                        rt_genid(dev_net(rth->dst.dev)));
2101         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2102         if (IS_ERR(rth))
2103                 return PTR_ERR(rth);
2104         return 0;
2105 }
2106
2107 /*
2108  *      NOTE. We drop all the packets that has local source
2109  *      addresses, because every properly looped back packet
2110  *      must have correct destination already attached by output routine.
2111  *
2112  *      Such approach solves two big problems:
2113  *      1. Not simplex devices are handled properly.
2114  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2115  *      called with rcu_read_lock()
2116  */
2117
2118 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2119                                u8 tos, struct net_device *dev)
2120 {
2121         struct fib_result res;
2122         struct in_device *in_dev = __in_dev_get_rcu(dev);
2123         struct flowi4   fl4;
2124         unsigned        flags = 0;
2125         u32             itag = 0;
2126         struct rtable * rth;
2127         unsigned        hash;
2128         __be32          spec_dst;
2129         int             err = -EINVAL;
2130         struct net    * net = dev_net(dev);
2131
2132         /* IP on this device is disabled. */
2133
2134         if (!in_dev)
2135                 goto out;
2136
2137         /* Check for the most weird martians, which can be not detected
2138            by fib_lookup.
2139          */
2140
2141         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2142             ipv4_is_loopback(saddr))
2143                 goto martian_source;
2144
2145         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2146                 goto brd_input;
2147
2148         /* Accept zero addresses only to limited broadcast;
2149          * I even do not know to fix it or not. Waiting for complains :-)
2150          */
2151         if (ipv4_is_zeronet(saddr))
2152                 goto martian_source;
2153
2154         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2155                 goto martian_destination;
2156
2157         /*
2158          *      Now we are ready to route packet.
2159          */
2160         fl4.flowi4_oif = 0;
2161         fl4.flowi4_iif = dev->ifindex;
2162         fl4.flowi4_mark = skb->mark;
2163         fl4.flowi4_tos = tos;
2164         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2165         fl4.daddr = daddr;
2166         fl4.saddr = saddr;
2167         err = fib_lookup(net, &fl4, &res);
2168         if (err != 0) {
2169                 if (!IN_DEV_FORWARD(in_dev))
2170                         goto e_hostunreach;
2171                 goto no_route;
2172         }
2173
2174         RT_CACHE_STAT_INC(in_slow_tot);
2175
2176         if (res.type == RTN_BROADCAST)
2177                 goto brd_input;
2178
2179         if (res.type == RTN_LOCAL) {
2180                 err = fib_validate_source(skb, saddr, daddr, tos,
2181                                           net->loopback_dev->ifindex,
2182                                           dev, &spec_dst, &itag);
2183                 if (err < 0)
2184                         goto martian_source_keep_err;
2185                 if (err)
2186                         flags |= RTCF_DIRECTSRC;
2187                 spec_dst = daddr;
2188                 goto local_input;
2189         }
2190
2191         if (!IN_DEV_FORWARD(in_dev))
2192                 goto e_hostunreach;
2193         if (res.type != RTN_UNICAST)
2194                 goto martian_destination;
2195
2196         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2197 out:    return err;
2198
2199 brd_input:
2200         if (skb->protocol != htons(ETH_P_IP))
2201                 goto e_inval;
2202
2203         if (ipv4_is_zeronet(saddr))
2204                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2205         else {
2206                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2207                                           &itag);
2208                 if (err < 0)
2209                         goto martian_source_keep_err;
2210                 if (err)
2211                         flags |= RTCF_DIRECTSRC;
2212         }
2213         flags |= RTCF_BROADCAST;
2214         res.type = RTN_BROADCAST;
2215         RT_CACHE_STAT_INC(in_brd);
2216
2217 local_input:
2218         rth = rt_dst_alloc(net->loopback_dev,
2219                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2220         if (!rth)
2221                 goto e_nobufs;
2222
2223         rth->dst.input= ip_local_deliver;
2224         rth->dst.output= ip_rt_bug;
2225 #ifdef CONFIG_IP_ROUTE_CLASSID
2226         rth->dst.tclassid = itag;
2227 #endif
2228
2229         rth->rt_key_dst = daddr;
2230         rth->rt_key_src = saddr;
2231         rth->rt_genid = rt_genid(net);
2232         rth->rt_flags   = flags|RTCF_LOCAL;
2233         rth->rt_type    = res.type;
2234         rth->rt_key_tos = tos;
2235         rth->rt_dst     = daddr;
2236         rth->rt_src     = saddr;
2237 #ifdef CONFIG_IP_ROUTE_CLASSID
2238         rth->dst.tclassid = itag;
2239 #endif
2240         rth->rt_route_iif = dev->ifindex;
2241         rth->rt_iif     = dev->ifindex;
2242         rth->rt_oif     = 0;
2243         rth->rt_mark    = skb->mark;
2244         rth->rt_gateway = daddr;
2245         rth->rt_spec_dst= spec_dst;
2246         rth->rt_peer_genid = 0;
2247         rth->peer = NULL;
2248         rth->fi = NULL;
2249         if (res.type == RTN_UNREACHABLE) {
2250                 rth->dst.input= ip_error;
2251                 rth->dst.error= -err;
2252                 rth->rt_flags   &= ~RTCF_LOCAL;
2253         }
2254         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2255         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2256         err = 0;
2257         if (IS_ERR(rth))
2258                 err = PTR_ERR(rth);
2259         goto out;
2260
2261 no_route:
2262         RT_CACHE_STAT_INC(in_no_route);
2263         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2264         res.type = RTN_UNREACHABLE;
2265         if (err == -ESRCH)
2266                 err = -ENETUNREACH;
2267         goto local_input;
2268
2269         /*
2270          *      Do not cache martian addresses: they should be logged (RFC1812)
2271          */
2272 martian_destination:
2273         RT_CACHE_STAT_INC(in_martian_dst);
2274 #ifdef CONFIG_IP_ROUTE_VERBOSE
2275         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2276                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2277                         &daddr, &saddr, dev->name);
2278 #endif
2279
2280 e_hostunreach:
2281         err = -EHOSTUNREACH;
2282         goto out;
2283
2284 e_inval:
2285         err = -EINVAL;
2286         goto out;
2287
2288 e_nobufs:
2289         err = -ENOBUFS;
2290         goto out;
2291
2292 martian_source:
2293         err = -EINVAL;
2294 martian_source_keep_err:
2295         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2296         goto out;
2297 }
2298
2299 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2300                            u8 tos, struct net_device *dev, bool noref)
2301 {
2302         struct rtable * rth;
2303         unsigned        hash;
2304         int iif = dev->ifindex;
2305         struct net *net;
2306         int res;
2307
2308         net = dev_net(dev);
2309
2310         rcu_read_lock();
2311
2312         if (!rt_caching(net))
2313                 goto skip_cache;
2314
2315         tos &= IPTOS_RT_MASK;
2316         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2317
2318         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2319              rth = rcu_dereference(rth->dst.rt_next)) {
2320                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2321                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2322                      (rth->rt_iif ^ iif) |
2323                      rth->rt_oif |
2324                      (rth->rt_key_tos ^ tos)) == 0 &&
2325                     rth->rt_mark == skb->mark &&
2326                     net_eq(dev_net(rth->dst.dev), net) &&
2327                     !rt_is_expired(rth)) {
2328                         if (noref) {
2329                                 dst_use_noref(&rth->dst, jiffies);
2330                                 skb_dst_set_noref(skb, &rth->dst);
2331                         } else {
2332                                 dst_use(&rth->dst, jiffies);
2333                                 skb_dst_set(skb, &rth->dst);
2334                         }
2335                         RT_CACHE_STAT_INC(in_hit);
2336                         rcu_read_unlock();
2337                         return 0;
2338                 }
2339                 RT_CACHE_STAT_INC(in_hlist_search);
2340         }
2341
2342 skip_cache:
2343         /* Multicast recognition logic is moved from route cache to here.
2344            The problem was that too many Ethernet cards have broken/missing
2345            hardware multicast filters :-( As result the host on multicasting
2346            network acquires a lot of useless route cache entries, sort of
2347            SDR messages from all the world. Now we try to get rid of them.
2348            Really, provided software IP multicast filter is organized
2349            reasonably (at least, hashed), it does not result in a slowdown
2350            comparing with route cache reject entries.
2351            Note, that multicast routers are not affected, because
2352            route cache entry is created eventually.
2353          */
2354         if (ipv4_is_multicast(daddr)) {
2355                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2356
2357                 if (in_dev) {
2358                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2359                                                   ip_hdr(skb)->protocol);
2360                         if (our
2361 #ifdef CONFIG_IP_MROUTE
2362                                 ||
2363                             (!ipv4_is_local_multicast(daddr) &&
2364                              IN_DEV_MFORWARD(in_dev))
2365 #endif
2366                            ) {
2367                                 int res = ip_route_input_mc(skb, daddr, saddr,
2368                                                             tos, dev, our);
2369                                 rcu_read_unlock();
2370                                 return res;
2371                         }
2372                 }
2373                 rcu_read_unlock();
2374                 return -EINVAL;
2375         }
2376         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2377         rcu_read_unlock();
2378         return res;
2379 }
2380 EXPORT_SYMBOL(ip_route_input_common);
2381
2382 /* called with rcu_read_lock() */
2383 static struct rtable *__mkroute_output(const struct fib_result *res,
2384                                        const struct flowi4 *fl4,
2385                                        __be32 orig_daddr, __be32 orig_saddr,
2386                                        int orig_oif, struct net_device *dev_out,
2387                                        unsigned int flags)
2388 {
2389         struct fib_info *fi = res->fi;
2390         u32 tos = RT_FL_TOS(fl4);
2391         struct in_device *in_dev;
2392         u16 type = res->type;
2393         struct rtable *rth;
2394
2395         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2396                 return ERR_PTR(-EINVAL);
2397
2398         if (ipv4_is_lbcast(fl4->daddr))
2399                 type = RTN_BROADCAST;
2400         else if (ipv4_is_multicast(fl4->daddr))
2401                 type = RTN_MULTICAST;
2402         else if (ipv4_is_zeronet(fl4->daddr))
2403                 return ERR_PTR(-EINVAL);
2404
2405         if (dev_out->flags & IFF_LOOPBACK)
2406                 flags |= RTCF_LOCAL;
2407
2408         in_dev = __in_dev_get_rcu(dev_out);
2409         if (!in_dev)
2410                 return ERR_PTR(-EINVAL);
2411
2412         if (type == RTN_BROADCAST) {
2413                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2414                 fi = NULL;
2415         } else if (type == RTN_MULTICAST) {
2416                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2417                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2418                                      fl4->flowi4_proto))
2419                         flags &= ~RTCF_LOCAL;
2420                 /* If multicast route do not exist use
2421                  * default one, but do not gateway in this case.
2422                  * Yes, it is hack.
2423                  */
2424                 if (fi && res->prefixlen < 4)
2425                         fi = NULL;
2426         }
2427
2428         rth = rt_dst_alloc(dev_out,
2429                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2430                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2431         if (!rth)
2432                 return ERR_PTR(-ENOBUFS);
2433
2434         rth->dst.output = ip_output;
2435
2436         rth->rt_key_dst = orig_daddr;
2437         rth->rt_key_src = orig_saddr;
2438         rth->rt_genid = rt_genid(dev_net(dev_out));
2439         rth->rt_flags   = flags;
2440         rth->rt_type    = type;
2441         rth->rt_key_tos = tos;
2442         rth->rt_dst     = fl4->daddr;
2443         rth->rt_src     = fl4->saddr;
2444         rth->rt_route_iif = 0;
2445         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2446         rth->rt_oif     = orig_oif;
2447         rth->rt_mark    = fl4->flowi4_mark;
2448         rth->rt_gateway = fl4->daddr;
2449         rth->rt_spec_dst= fl4->saddr;
2450         rth->rt_peer_genid = 0;
2451         rth->peer = NULL;
2452         rth->fi = NULL;
2453
2454         RT_CACHE_STAT_INC(out_slow_tot);
2455
2456         if (flags & RTCF_LOCAL) {
2457                 rth->dst.input = ip_local_deliver;
2458                 rth->rt_spec_dst = fl4->daddr;
2459         }
2460         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2461                 rth->rt_spec_dst = fl4->saddr;
2462                 if (flags & RTCF_LOCAL &&
2463                     !(dev_out->flags & IFF_LOOPBACK)) {
2464                         rth->dst.output = ip_mc_output;
2465                         RT_CACHE_STAT_INC(out_slow_mc);
2466                 }
2467 #ifdef CONFIG_IP_MROUTE
2468                 if (type == RTN_MULTICAST) {
2469                         if (IN_DEV_MFORWARD(in_dev) &&
2470                             !ipv4_is_local_multicast(fl4->daddr)) {
2471                                 rth->dst.input = ip_mr_input;
2472                                 rth->dst.output = ip_mc_output;
2473                         }
2474                 }
2475 #endif
2476         }
2477
2478         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2479
2480         return rth;
2481 }
2482
2483 /*
2484  * Major route resolver routine.
2485  * called with rcu_read_lock();
2486  */
2487
2488 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2489 {
2490         struct net_device *dev_out = NULL;
2491         u32 tos = RT_FL_TOS(fl4);
2492         unsigned int flags = 0;
2493         struct fib_result res;
2494         struct rtable *rth;
2495         __be32 orig_daddr;
2496         __be32 orig_saddr;
2497         int orig_oif;
2498
2499         res.fi          = NULL;
2500 #ifdef CONFIG_IP_MULTIPLE_TABLES
2501         res.r           = NULL;
2502 #endif
2503
2504         orig_daddr = fl4->daddr;
2505         orig_saddr = fl4->saddr;
2506         orig_oif = fl4->flowi4_oif;
2507
2508         fl4->flowi4_iif = net->loopback_dev->ifindex;
2509         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2510         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2511                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2512
2513         rcu_read_lock();
2514         if (fl4->saddr) {
2515                 rth = ERR_PTR(-EINVAL);
2516                 if (ipv4_is_multicast(fl4->saddr) ||
2517                     ipv4_is_lbcast(fl4->saddr) ||
2518                     ipv4_is_zeronet(fl4->saddr))
2519                         goto out;
2520
2521                 /* I removed check for oif == dev_out->oif here.
2522                    It was wrong for two reasons:
2523                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2524                       is assigned to multiple interfaces.
2525                    2. Moreover, we are allowed to send packets with saddr
2526                       of another iface. --ANK
2527                  */
2528
2529                 if (fl4->flowi4_oif == 0 &&
2530                     (ipv4_is_multicast(fl4->daddr) ||
2531                      ipv4_is_lbcast(fl4->daddr))) {
2532                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2533                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2534                         if (dev_out == NULL)
2535                                 goto out;
2536
2537                         /* Special hack: user can direct multicasts
2538                            and limited broadcast via necessary interface
2539                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2540                            This hack is not just for fun, it allows
2541                            vic,vat and friends to work.
2542                            They bind socket to loopback, set ttl to zero
2543                            and expect that it will work.
2544                            From the viewpoint of routing cache they are broken,
2545                            because we are not allowed to build multicast path
2546                            with loopback source addr (look, routing cache
2547                            cannot know, that ttl is zero, so that packet
2548                            will not leave this host and route is valid).
2549                            Luckily, this hack is good workaround.
2550                          */
2551
2552                         fl4->flowi4_oif = dev_out->ifindex;
2553                         goto make_route;
2554                 }
2555
2556                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2557                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2558                         if (!__ip_dev_find(net, fl4->saddr, false))
2559                                 goto out;
2560                 }
2561         }
2562
2563
2564         if (fl4->flowi4_oif) {
2565                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2566                 rth = ERR_PTR(-ENODEV);
2567                 if (dev_out == NULL)
2568                         goto out;
2569
2570                 /* RACE: Check return value of inet_select_addr instead. */
2571                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2572                         rth = ERR_PTR(-ENETUNREACH);
2573                         goto out;
2574                 }
2575                 if (ipv4_is_local_multicast(fl4->daddr) ||
2576                     ipv4_is_lbcast(fl4->daddr)) {
2577                         if (!fl4->saddr)
2578                                 fl4->saddr = inet_select_addr(dev_out, 0,
2579                                                               RT_SCOPE_LINK);
2580                         goto make_route;
2581                 }
2582                 if (fl4->saddr) {
2583                         if (ipv4_is_multicast(fl4->daddr))
2584                                 fl4->saddr = inet_select_addr(dev_out, 0,
2585                                                               fl4->flowi4_scope);
2586                         else if (!fl4->daddr)
2587                                 fl4->saddr = inet_select_addr(dev_out, 0,
2588                                                               RT_SCOPE_HOST);
2589                 }
2590         }
2591
2592         if (!fl4->daddr) {
2593                 fl4->daddr = fl4->saddr;
2594                 if (!fl4->daddr)
2595                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2596                 dev_out = net->loopback_dev;
2597                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2598                 res.type = RTN_LOCAL;
2599                 flags |= RTCF_LOCAL;
2600                 goto make_route;
2601         }
2602
2603         if (fib_lookup(net, fl4, &res)) {
2604                 res.fi = NULL;
2605                 if (fl4->flowi4_oif) {
2606                         /* Apparently, routing tables are wrong. Assume,
2607                            that the destination is on link.
2608
2609                            WHY? DW.
2610                            Because we are allowed to send to iface
2611                            even if it has NO routes and NO assigned
2612                            addresses. When oif is specified, routing
2613                            tables are looked up with only one purpose:
2614                            to catch if destination is gatewayed, rather than
2615                            direct. Moreover, if MSG_DONTROUTE is set,
2616                            we send packet, ignoring both routing tables
2617                            and ifaddr state. --ANK
2618
2619
2620                            We could make it even if oif is unknown,
2621                            likely IPv6, but we do not.
2622                          */
2623
2624                         if (fl4->saddr == 0)
2625                                 fl4->saddr = inet_select_addr(dev_out, 0,
2626                                                               RT_SCOPE_LINK);
2627                         res.type = RTN_UNICAST;
2628                         goto make_route;
2629                 }
2630                 rth = ERR_PTR(-ENETUNREACH);
2631                 goto out;
2632         }
2633
2634         if (res.type == RTN_LOCAL) {
2635                 if (!fl4->saddr) {
2636                         if (res.fi->fib_prefsrc)
2637                                 fl4->saddr = res.fi->fib_prefsrc;
2638                         else
2639                                 fl4->saddr = fl4->daddr;
2640                 }
2641                 dev_out = net->loopback_dev;
2642                 fl4->flowi4_oif = dev_out->ifindex;
2643                 res.fi = NULL;
2644                 flags |= RTCF_LOCAL;
2645                 goto make_route;
2646         }
2647
2648 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2649         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2650                 fib_select_multipath(&res);
2651         else
2652 #endif
2653         if (!res.prefixlen &&
2654             res.table->tb_num_default > 1 &&
2655             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2656                 fib_select_default(&res);
2657
2658         if (!fl4->saddr)
2659                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2660
2661         dev_out = FIB_RES_DEV(res);
2662         fl4->flowi4_oif = dev_out->ifindex;
2663
2664
2665 make_route:
2666         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2667                                dev_out, flags);
2668         if (!IS_ERR(rth)) {
2669                 unsigned int hash;
2670
2671                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2672                                rt_genid(dev_net(dev_out)));
2673                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2674         }
2675
2676 out:
2677         rcu_read_unlock();
2678         return rth;
2679 }
2680
2681 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2682 {
2683         struct rtable *rth;
2684         unsigned int hash;
2685
2686         if (!rt_caching(net))
2687                 goto slow_output;
2688
2689         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2690
2691         rcu_read_lock_bh();
2692         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2693                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2694                 if (rth->rt_key_dst == flp4->daddr &&
2695                     rth->rt_key_src == flp4->saddr &&
2696                     rt_is_output_route(rth) &&
2697                     rth->rt_oif == flp4->flowi4_oif &&
2698                     rth->rt_mark == flp4->flowi4_mark &&
2699                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2700                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2701                     net_eq(dev_net(rth->dst.dev), net) &&
2702                     !rt_is_expired(rth)) {
2703                         dst_use(&rth->dst, jiffies);
2704                         RT_CACHE_STAT_INC(out_hit);
2705                         rcu_read_unlock_bh();
2706                         if (!flp4->saddr)
2707                                 flp4->saddr = rth->rt_src;
2708                         if (!flp4->daddr)
2709                                 flp4->daddr = rth->rt_dst;
2710                         return rth;
2711                 }
2712                 RT_CACHE_STAT_INC(out_hlist_search);
2713         }
2714         rcu_read_unlock_bh();
2715
2716 slow_output:
2717         return ip_route_output_slow(net, flp4);
2718 }
2719 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2720
2721 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2722 {
2723         return NULL;
2724 }
2725
2726 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2727 {
2728         return 0;
2729 }
2730
2731 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2732 {
2733 }
2734
2735 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2736                                           unsigned long old)
2737 {
2738         return NULL;
2739 }
2740
2741 static struct dst_ops ipv4_dst_blackhole_ops = {
2742         .family                 =       AF_INET,
2743         .protocol               =       cpu_to_be16(ETH_P_IP),
2744         .destroy                =       ipv4_dst_destroy,
2745         .check                  =       ipv4_blackhole_dst_check,
2746         .default_mtu            =       ipv4_blackhole_default_mtu,
2747         .default_advmss         =       ipv4_default_advmss,
2748         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2749         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2750         .neigh_lookup           =       ipv4_neigh_lookup,
2751 };
2752
2753 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2754 {
2755         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2756         struct rtable *ort = (struct rtable *) dst_orig;
2757
2758         if (rt) {
2759                 struct dst_entry *new = &rt->dst;
2760
2761                 new->__use = 1;
2762                 new->input = dst_discard;
2763                 new->output = dst_discard;
2764                 dst_copy_metrics(new, &ort->dst);
2765
2766                 new->dev = ort->dst.dev;
2767                 if (new->dev)
2768                         dev_hold(new->dev);
2769
2770                 rt->rt_key_dst = ort->rt_key_dst;
2771                 rt->rt_key_src = ort->rt_key_src;
2772                 rt->rt_key_tos = ort->rt_key_tos;
2773                 rt->rt_route_iif = ort->rt_route_iif;
2774                 rt->rt_iif = ort->rt_iif;
2775                 rt->rt_oif = ort->rt_oif;
2776                 rt->rt_mark = ort->rt_mark;
2777
2778                 rt->rt_genid = rt_genid(net);
2779                 rt->rt_flags = ort->rt_flags;
2780                 rt->rt_type = ort->rt_type;
2781                 rt->rt_dst = ort->rt_dst;
2782                 rt->rt_src = ort->rt_src;
2783                 rt->rt_gateway = ort->rt_gateway;
2784                 rt->rt_spec_dst = ort->rt_spec_dst;
2785                 rt->peer = ort->peer;
2786                 if (rt->peer)
2787                         atomic_inc(&rt->peer->refcnt);
2788                 rt->fi = ort->fi;
2789                 if (rt->fi)
2790                         atomic_inc(&rt->fi->fib_clntref);
2791
2792                 dst_free(new);
2793         }
2794
2795         dst_release(dst_orig);
2796
2797         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2798 }
2799
2800 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2801                                     struct sock *sk)
2802 {
2803         struct rtable *rt = __ip_route_output_key(net, flp4);
2804
2805         if (IS_ERR(rt))
2806                 return rt;
2807
2808         if (flp4->flowi4_proto)
2809                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2810                                                    flowi4_to_flowi(flp4),
2811                                                    sk, 0);
2812
2813         return rt;
2814 }
2815 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2816
2817 static int rt_fill_info(struct net *net,
2818                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2819                         int nowait, unsigned int flags)
2820 {
2821         struct rtable *rt = skb_rtable(skb);
2822         struct rtmsg *r;
2823         struct nlmsghdr *nlh;
2824         long expires = 0;
2825         const struct inet_peer *peer = rt->peer;
2826         u32 id = 0, ts = 0, tsage = 0, error;
2827
2828         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2829         if (nlh == NULL)
2830                 return -EMSGSIZE;
2831
2832         r = nlmsg_data(nlh);
2833         r->rtm_family    = AF_INET;
2834         r->rtm_dst_len  = 32;
2835         r->rtm_src_len  = 0;
2836         r->rtm_tos      = rt->rt_key_tos;
2837         r->rtm_table    = RT_TABLE_MAIN;
2838         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2839         r->rtm_type     = rt->rt_type;
2840         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2841         r->rtm_protocol = RTPROT_UNSPEC;
2842         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2843         if (rt->rt_flags & RTCF_NOTIFY)
2844                 r->rtm_flags |= RTM_F_NOTIFY;
2845
2846         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2847
2848         if (rt->rt_key_src) {
2849                 r->rtm_src_len = 32;
2850                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2851         }
2852         if (rt->dst.dev)
2853                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2854 #ifdef CONFIG_IP_ROUTE_CLASSID
2855         if (rt->dst.tclassid)
2856                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2857 #endif
2858         if (rt_is_input_route(rt))
2859                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2860         else if (rt->rt_src != rt->rt_key_src)
2861                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2862
2863         if (rt->rt_dst != rt->rt_gateway)
2864                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2865
2866         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2867                 goto nla_put_failure;
2868
2869         if (rt->rt_mark)
2870                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2871
2872         error = rt->dst.error;
2873         if (peer) {
2874                 inet_peer_refcheck(rt->peer);
2875                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2876                 if (peer->tcp_ts_stamp) {
2877                         ts = peer->tcp_ts;
2878                         tsage = get_seconds() - peer->tcp_ts_stamp;
2879                 }
2880                 expires = ACCESS_ONCE(peer->pmtu_expires);
2881                 if (expires)
2882                         expires -= jiffies;
2883         }
2884
2885         if (rt_is_input_route(rt)) {
2886 #ifdef CONFIG_IP_MROUTE
2887                 __be32 dst = rt->rt_dst;
2888
2889                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2890                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2891                         int err = ipmr_get_route(net, skb,
2892                                                  rt->rt_src, rt->rt_dst,
2893                                                  r, nowait);
2894                         if (err <= 0) {
2895                                 if (!nowait) {
2896                                         if (err == 0)
2897                                                 return 0;
2898                                         goto nla_put_failure;
2899                                 } else {
2900                                         if (err == -EMSGSIZE)
2901                                                 goto nla_put_failure;
2902                                         error = err;
2903                                 }
2904                         }
2905                 } else
2906 #endif
2907                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2908         }
2909
2910         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2911                                expires, error) < 0)
2912                 goto nla_put_failure;
2913
2914         return nlmsg_end(skb, nlh);
2915
2916 nla_put_failure:
2917         nlmsg_cancel(skb, nlh);
2918         return -EMSGSIZE;
2919 }
2920
2921 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2922 {
2923         struct net *net = sock_net(in_skb->sk);
2924         struct rtmsg *rtm;
2925         struct nlattr *tb[RTA_MAX+1];
2926         struct rtable *rt = NULL;
2927         __be32 dst = 0;
2928         __be32 src = 0;
2929         u32 iif;
2930         int err;
2931         int mark;
2932         struct sk_buff *skb;
2933
2934         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2935         if (err < 0)
2936                 goto errout;
2937
2938         rtm = nlmsg_data(nlh);
2939
2940         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2941         if (skb == NULL) {
2942                 err = -ENOBUFS;
2943                 goto errout;
2944         }
2945
2946         /* Reserve room for dummy headers, this skb can pass
2947            through good chunk of routing engine.
2948          */
2949         skb_reset_mac_header(skb);
2950         skb_reset_network_header(skb);
2951
2952         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2953         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2954         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2955
2956         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2957         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2958         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2959         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2960
2961         if (iif) {
2962                 struct net_device *dev;
2963
2964                 dev = __dev_get_by_index(net, iif);
2965                 if (dev == NULL) {
2966                         err = -ENODEV;
2967                         goto errout_free;
2968                 }
2969
2970                 skb->protocol   = htons(ETH_P_IP);
2971                 skb->dev        = dev;
2972                 skb->mark       = mark;
2973                 local_bh_disable();
2974                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2975                 local_bh_enable();
2976
2977                 rt = skb_rtable(skb);
2978                 if (err == 0 && rt->dst.error)
2979                         err = -rt->dst.error;
2980         } else {
2981                 struct flowi4 fl4 = {
2982                         .daddr = dst,
2983                         .saddr = src,
2984                         .flowi4_tos = rtm->rtm_tos,
2985                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2986                         .flowi4_mark = mark,
2987                 };
2988                 rt = ip_route_output_key(net, &fl4);
2989
2990                 err = 0;
2991                 if (IS_ERR(rt))
2992                         err = PTR_ERR(rt);
2993         }
2994
2995         if (err)
2996                 goto errout_free;
2997
2998         skb_dst_set(skb, &rt->dst);
2999         if (rtm->rtm_flags & RTM_F_NOTIFY)
3000                 rt->rt_flags |= RTCF_NOTIFY;
3001
3002         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3003                            RTM_NEWROUTE, 0, 0);
3004         if (err <= 0)
3005                 goto errout_free;
3006
3007         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3008 errout:
3009         return err;
3010
3011 errout_free:
3012         kfree_skb(skb);
3013         goto errout;
3014 }
3015
3016 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3017 {
3018         struct rtable *rt;
3019         int h, s_h;
3020         int idx, s_idx;
3021         struct net *net;
3022
3023         net = sock_net(skb->sk);
3024
3025         s_h = cb->args[0];
3026         if (s_h < 0)
3027                 s_h = 0;
3028         s_idx = idx = cb->args[1];
3029         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3030                 if (!rt_hash_table[h].chain)
3031                         continue;
3032                 rcu_read_lock_bh();
3033                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3034                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3035                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3036                                 continue;
3037                         if (rt_is_expired(rt))
3038                                 continue;
3039                         skb_dst_set_noref(skb, &rt->dst);
3040                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3041                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3042                                          1, NLM_F_MULTI) <= 0) {
3043                                 skb_dst_drop(skb);
3044                                 rcu_read_unlock_bh();
3045                                 goto done;
3046                         }
3047                         skb_dst_drop(skb);
3048                 }
3049                 rcu_read_unlock_bh();
3050         }
3051
3052 done:
3053         cb->args[0] = h;
3054         cb->args[1] = idx;
3055         return skb->len;
3056 }
3057
3058 void ip_rt_multicast_event(struct in_device *in_dev)
3059 {
3060         rt_cache_flush(dev_net(in_dev->dev), 0);
3061 }
3062
3063 #ifdef CONFIG_SYSCTL
3064 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3065                                         void __user *buffer,
3066                                         size_t *lenp, loff_t *ppos)
3067 {
3068         if (write) {
3069                 int flush_delay;
3070                 ctl_table ctl;
3071                 struct net *net;
3072
3073                 memcpy(&ctl, __ctl, sizeof(ctl));
3074                 ctl.data = &flush_delay;
3075                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3076
3077                 net = (struct net *)__ctl->extra1;
3078                 rt_cache_flush(net, flush_delay);
3079                 return 0;
3080         }
3081
3082         return -EINVAL;
3083 }
3084
3085 static ctl_table ipv4_route_table[] = {
3086         {
3087                 .procname       = "gc_thresh",
3088                 .data           = &ipv4_dst_ops.gc_thresh,
3089                 .maxlen         = sizeof(int),
3090                 .mode           = 0644,
3091                 .proc_handler   = proc_dointvec,
3092         },
3093         {
3094                 .procname       = "max_size",
3095                 .data           = &ip_rt_max_size,
3096                 .maxlen         = sizeof(int),
3097                 .mode           = 0644,
3098                 .proc_handler   = proc_dointvec,
3099         },
3100         {
3101                 /*  Deprecated. Use gc_min_interval_ms */
3102
3103                 .procname       = "gc_min_interval",
3104                 .data           = &ip_rt_gc_min_interval,
3105                 .maxlen         = sizeof(int),
3106                 .mode           = 0644,
3107                 .proc_handler   = proc_dointvec_jiffies,
3108         },
3109         {
3110                 .procname       = "gc_min_interval_ms",
3111                 .data           = &ip_rt_gc_min_interval,
3112                 .maxlen         = sizeof(int),
3113                 .mode           = 0644,
3114                 .proc_handler   = proc_dointvec_ms_jiffies,
3115         },
3116         {
3117                 .procname       = "gc_timeout",
3118                 .data           = &ip_rt_gc_timeout,
3119                 .maxlen         = sizeof(int),
3120                 .mode           = 0644,
3121                 .proc_handler   = proc_dointvec_jiffies,
3122         },
3123         {
3124                 .procname       = "gc_interval",
3125                 .data           = &ip_rt_gc_interval,
3126                 .maxlen         = sizeof(int),
3127                 .mode           = 0644,
3128                 .proc_handler   = proc_dointvec_jiffies,
3129         },
3130         {
3131                 .procname       = "redirect_load",
3132                 .data           = &ip_rt_redirect_load,
3133                 .maxlen         = sizeof(int),
3134                 .mode           = 0644,
3135                 .proc_handler   = proc_dointvec,
3136         },
3137         {
3138                 .procname       = "redirect_number",
3139                 .data           = &ip_rt_redirect_number,
3140                 .maxlen         = sizeof(int),
3141                 .mode           = 0644,
3142                 .proc_handler   = proc_dointvec,
3143         },
3144         {
3145                 .procname       = "redirect_silence",
3146                 .data           = &ip_rt_redirect_silence,
3147                 .maxlen         = sizeof(int),
3148                 .mode           = 0644,
3149                 .proc_handler   = proc_dointvec,
3150         },
3151         {
3152                 .procname       = "error_cost",
3153                 .data           = &ip_rt_error_cost,
3154                 .maxlen         = sizeof(int),
3155                 .mode           = 0644,
3156                 .proc_handler   = proc_dointvec,
3157         },
3158         {
3159                 .procname       = "error_burst",
3160                 .data           = &ip_rt_error_burst,
3161                 .maxlen         = sizeof(int),
3162                 .mode           = 0644,
3163                 .proc_handler   = proc_dointvec,
3164         },
3165         {
3166                 .procname       = "gc_elasticity",
3167                 .data           = &ip_rt_gc_elasticity,
3168                 .maxlen         = sizeof(int),
3169                 .mode           = 0644,
3170                 .proc_handler   = proc_dointvec,
3171         },
3172         {
3173                 .procname       = "mtu_expires",
3174                 .data           = &ip_rt_mtu_expires,
3175                 .maxlen         = sizeof(int),
3176                 .mode           = 0644,
3177                 .proc_handler   = proc_dointvec_jiffies,
3178         },
3179         {
3180                 .procname       = "min_pmtu",
3181                 .data           = &ip_rt_min_pmtu,
3182                 .maxlen         = sizeof(int),
3183                 .mode           = 0644,
3184                 .proc_handler   = proc_dointvec,
3185         },
3186         {
3187                 .procname       = "min_adv_mss",
3188                 .data           = &ip_rt_min_advmss,
3189                 .maxlen         = sizeof(int),
3190                 .mode           = 0644,
3191                 .proc_handler   = proc_dointvec,
3192         },
3193         { }
3194 };
3195
3196 static struct ctl_table empty[1];
3197
3198 static struct ctl_table ipv4_skeleton[] =
3199 {
3200         { .procname = "route",
3201           .mode = 0555, .child = ipv4_route_table},
3202         { .procname = "neigh",
3203           .mode = 0555, .child = empty},
3204         { }
3205 };
3206
3207 static __net_initdata struct ctl_path ipv4_path[] = {
3208         { .procname = "net", },
3209         { .procname = "ipv4", },
3210         { },
3211 };
3212
3213 static struct ctl_table ipv4_route_flush_table[] = {
3214         {
3215                 .procname       = "flush",
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0200,
3218                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3219         },
3220         { },
3221 };
3222
3223 static __net_initdata struct ctl_path ipv4_route_path[] = {
3224         { .procname = "net", },
3225         { .procname = "ipv4", },
3226         { .procname = "route", },
3227         { },
3228 };
3229
3230 static __net_init int sysctl_route_net_init(struct net *net)
3231 {
3232         struct ctl_table *tbl;
3233
3234         tbl = ipv4_route_flush_table;
3235         if (!net_eq(net, &init_net)) {
3236                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3237                 if (tbl == NULL)
3238                         goto err_dup;
3239         }
3240         tbl[0].extra1 = net;
3241
3242         net->ipv4.route_hdr =
3243                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3244         if (net->ipv4.route_hdr == NULL)
3245                 goto err_reg;
3246         return 0;
3247
3248 err_reg:
3249         if (tbl != ipv4_route_flush_table)
3250                 kfree(tbl);
3251 err_dup:
3252         return -ENOMEM;
3253 }
3254
3255 static __net_exit void sysctl_route_net_exit(struct net *net)
3256 {
3257         struct ctl_table *tbl;
3258
3259         tbl = net->ipv4.route_hdr->ctl_table_arg;
3260         unregister_net_sysctl_table(net->ipv4.route_hdr);
3261         BUG_ON(tbl == ipv4_route_flush_table);
3262         kfree(tbl);
3263 }
3264
3265 static __net_initdata struct pernet_operations sysctl_route_ops = {
3266         .init = sysctl_route_net_init,
3267         .exit = sysctl_route_net_exit,
3268 };
3269 #endif
3270
3271 static __net_init int rt_genid_init(struct net *net)
3272 {
3273         get_random_bytes(&net->ipv4.rt_genid,
3274                          sizeof(net->ipv4.rt_genid));
3275         get_random_bytes(&net->ipv4.dev_addr_genid,
3276                          sizeof(net->ipv4.dev_addr_genid));
3277         return 0;
3278 }
3279
3280 static __net_initdata struct pernet_operations rt_genid_ops = {
3281         .init = rt_genid_init,
3282 };
3283
3284
3285 #ifdef CONFIG_IP_ROUTE_CLASSID
3286 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3287 #endif /* CONFIG_IP_ROUTE_CLASSID */
3288
3289 static __initdata unsigned long rhash_entries;
3290 static int __init set_rhash_entries(char *str)
3291 {
3292         if (!str)
3293                 return 0;
3294         rhash_entries = simple_strtoul(str, &str, 0);
3295         return 1;
3296 }
3297 __setup("rhash_entries=", set_rhash_entries);
3298
3299 int __init ip_rt_init(void)
3300 {
3301         int rc = 0;
3302
3303 #ifdef CONFIG_IP_ROUTE_CLASSID
3304         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3305         if (!ip_rt_acct)
3306                 panic("IP: failed to allocate ip_rt_acct\n");
3307 #endif
3308
3309         ipv4_dst_ops.kmem_cachep =
3310                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3311                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3312
3313         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3314
3315         if (dst_entries_init(&ipv4_dst_ops) < 0)
3316                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3317
3318         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3319                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3320
3321         rt_hash_table = (struct rt_hash_bucket *)
3322                 alloc_large_system_hash("IP route cache",
3323                                         sizeof(struct rt_hash_bucket),
3324                                         rhash_entries,
3325                                         (totalram_pages >= 128 * 1024) ?
3326                                         15 : 17,
3327                                         0,
3328                                         &rt_hash_log,
3329                                         &rt_hash_mask,
3330                                         rhash_entries ? 0 : 512 * 1024);
3331         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3332         rt_hash_lock_init();
3333
3334         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3335         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3336
3337         devinet_init();
3338         ip_fib_init();
3339
3340         if (ip_rt_proc_init())
3341                 printk(KERN_ERR "Unable to create route proc files\n");
3342 #ifdef CONFIG_XFRM
3343         xfrm_init();
3344         xfrm4_init(ip_rt_max_size);
3345 #endif
3346         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3347
3348 #ifdef CONFIG_SYSCTL
3349         register_pernet_subsys(&sysctl_route_ops);
3350 #endif
3351         register_pernet_subsys(&rt_genid_ops);
3352         return rc;
3353 }
3354
3355 #ifdef CONFIG_SYSCTL
3356 /*
3357  * We really need to sanitize the damn ipv4 init order, then all
3358  * this nonsense will go away.
3359  */
3360 void __init ip_static_sysctl_init(void)
3361 {
3362         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3363 }
3364 #endif